This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

# Read the house data from kc_house_data.csv (header row, comma-separated).
house <- read.csv(file = "kc_house_data.csv", header = TRUE, sep = ",")

# Drop the `id` column; every other column is kept in the working data frame.
df <- subset(house, select = -id)

# Preview the first rows.
head(df)

Use `substr()` to extract the year and month from `date` into separate variables.

# Split the `date` string into separate year and month columns: characters
# 1-4 become `year` and characters 5-6 become `month`. R strings are
# 1-indexed, so the year starts at position 1 (a start of 0 only works
# because substr() treats any start below 1 as 1).
# Both columns are character vectors, so lm() treats them as categorical.
df$year <- substr(df$date, 1, 4)
df$month <- substr(df$date, 5, 6)

# The raw `date` column is dropped once its parts have been extracted.
df <- subset(df, select = -date)

Run the models

# Intercept-only baseline: the fitted value is the same for every row.
null_model <- lm(formula = price ~ 1, data = df)
summary(null_model)
## 
## Call:
## lm(formula = price ~ 1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -465088 -218138  -90088  104912 7159912 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   540088       2497   216.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 367100 on 21612 degrees of freedom
# Full model: price regressed on every other column of df.
# NOTE(review): the summary reports the sqft_basement coefficient as NA
# ("not defined because of singularities"); presumably sqft_basement is a
# linear combination of sqft_living and sqft_above -- verify.
full_model <- lm(formula = price ~ ., data = df)
summary(full_model)
## 
## Call:
## lm(formula = price ~ ., data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1320439   -98332    -9168    77291  4332206 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.590e+06  2.922e+06   2.255  0.02413 *  
## bedrooms      -3.593e+04  1.886e+03 -19.052  < 2e-16 ***
## bathrooms      4.130e+04  3.244e+03  12.732  < 2e-16 ***
## sqft_living    1.503e+02  4.372e+00  34.391  < 2e-16 ***
## sqft_lot       1.298e-01  4.778e-02   2.716  0.00661 ** 
## floors         6.969e+03  3.585e+03   1.944  0.05191 .  
## waterfront     5.833e+05  1.730e+04  33.705  < 2e-16 ***
## view           5.267e+04  2.133e+03  24.687  < 2e-16 ***
## condition      2.771e+04  2.348e+03  11.798  < 2e-16 ***
## grade          9.606e+04  2.146e+03  44.756  < 2e-16 ***
## sqft_above     3.131e+01  4.347e+00   7.203 6.07e-13 ***
## sqft_basement         NA         NA      NA       NA    
## yr_built      -2.614e+03  7.243e+01 -36.084  < 2e-16 ***
## yr_renovated   2.076e+01  3.646e+00   5.694 1.25e-08 ***
## zipcode       -5.842e+02  3.288e+01 -17.764  < 2e-16 ***
## lat            6.050e+05  1.070e+04  56.525  < 2e-16 ***
## long          -2.153e+05  1.309e+04 -16.447  < 2e-16 ***
## sqft_living15  2.149e+01  3.437e+00   6.251 4.16e-10 ***
## sqft_lot15    -3.905e-01  7.305e-02  -5.345 9.11e-08 ***
## year2015       6.003e+04  9.231e+03   6.503 8.03e-11 ***
## month02        9.277e+03  8.567e+03   1.083  0.27890    
## month03        3.508e+04  7.914e+03   4.433 9.34e-06 ***
## month04        3.662e+04  7.696e+03   4.758 1.96e-06 ***
## month05        5.431e+04  1.017e+04   5.338 9.51e-08 ***
## month06        6.139e+04  1.203e+04   5.102 3.39e-07 ***
## month07        5.739e+04  1.202e+04   4.773 1.83e-06 ***
## month08        5.974e+04  1.213e+04   4.926 8.46e-07 ***
## month09        5.404e+04  1.221e+04   4.427 9.58e-06 ***
## month10        6.151e+04  1.216e+04   5.060 4.23e-07 ***
## month11        5.722e+04  1.245e+04   4.598 4.29e-06 ***
## month12        5.190e+04  1.240e+04   4.186 2.85e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 200600 on 21583 degrees of freedom
## Multiple R-squared:  0.7019, Adjusted R-squared:  0.7015 
## F-statistic:  1753 on 29 and 21583 DF,  p-value: < 2.2e-16
# Forward stepwise selection by AIC: start from the intercept-only model and
# add one term per step, never going beyond the terms of full_model.
step(
  object = null_model,
  scope = list(lower = null_model, upper = full_model),
  direction = "forward"
)
## Start:  AIC=553875.8
## price ~ 1
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_living    1 1.4356e+15 1.4773e+15 539204
## + grade          1 1.2976e+15 1.6153e+15 541134
## + sqft_above     1 1.0682e+15 1.8447e+15 544004
## + sqft_living15  1 9.9816e+14 1.9148e+15 544810
## + bathrooms      1 8.0329e+14 2.1096e+15 546904
## + view           1 4.5978e+14 2.4531e+15 550165
## + sqft_basement  1 3.0544e+14 2.6075e+15 551484
## + bedrooms       1 2.7696e+14 2.6360e+15 551718
## + lat            1 2.7455e+14 2.6384e+15 551738
## + waterfront     1 2.0668e+14 2.7062e+15 552287
## + floors         1 1.9209e+14 2.7208e+15 552403
## + yr_renovated   1 4.6564e+13 2.8664e+15 553529
## + sqft_lot       1 2.3417e+13 2.8895e+15 553703
## + sqft_lot15     1 1.9801e+13 2.8931e+15 553730
## + yr_built       1 8.4977e+12 2.9044e+15 553815
## + zipcode        1 8.2451e+12 2.9047e+15 553817
## + condition      1 3.8514e+12 2.9091e+15 553849
## + month         11 4.6632e+12 2.9083e+15 553863
## + long           1 1.3624e+12 2.9116e+15 553868
## <none>                        2.9129e+15 553876
## + year           1 3.7251e+10 2.9129e+15 553878
## 
## Step:  AIC=539203.5
## price ~ sqft_living
## 
##                 Df  Sum of Sq        RSS    AIC
## + lat            1 2.1314e+14 1.2641e+15 535838
## + view           1 1.2362e+14 1.3537e+15 537317
## + grade          1 1.2132e+14 1.3560e+15 537353
## + waterfront     1 1.1024e+14 1.3670e+15 537529
## + yr_built       1 9.2854e+13 1.3844e+15 537802
## + long           1 6.6817e+13 1.4105e+15 538205
## + bedrooms       1 4.0635e+13 1.4366e+15 538603
## + zipcode        1 2.2858e+13 1.4544e+15 538868
## + yr_renovated   1 2.2405e+13 1.4549e+15 538875
## + sqft_living15  1 2.0109e+13 1.4572e+15 538909
## + condition      1 1.7605e+13 1.4597e+15 538946
## + sqft_lot15     1 6.4407e+12 1.4708e+15 539111
## + sqft_lot       1 3.0113e+12 1.4743e+15 539161
## + month         11 4.0868e+12 1.4732e+15 539166
## + year           1 1.6739e+12 1.4756e+15 539181
## + sqft_above     1 1.2165e+12 1.4761e+15 539188
## + sqft_basement  1 1.2165e+12 1.4761e+15 539188
## + floors         1 2.2991e+11 1.4770e+15 539202
## + bathrooms      1 1.4719e+11 1.4771e+15 539203
## <none>                        1.4773e+15 539204
## 
## Step:  AIC=535838
## price ~ sqft_living + lat
## 
##                 Df  Sum of Sq        RSS    AIC
## + view           1 1.2663e+14 1.1375e+15 533559
## + waterfront     1 1.1646e+14 1.1477e+15 533751
## + grade          1 8.8423e+13 1.1757e+15 534273
## + yr_built       1 5.1904e+13 1.2122e+15 534934
## + long           1 3.6167e+13 1.2280e+15 535213
## + bedrooms       1 3.2254e+13 1.2319e+15 535281
## + condition      1 1.9095e+13 1.2450e+15 535511
## + yr_renovated   1 1.8897e+13 1.2452e+15 535515
## + sqft_living15  1 1.8325e+13 1.2458e+15 535524
## + year           1 2.8880e+12 1.2613e+15 535791
## + month         11 4.0367e+12 1.2601e+15 535791
## + sqft_lot15     1 1.2429e+12 1.2629e+15 535819
## + zipcode        1 4.4621e+11 1.2637e+15 535832
## <none>                        1.2641e+15 535838
## + sqft_lot       1 1.0913e+11 1.2640e+15 535838
## + sqft_above     1 1.0387e+11 1.2640e+15 535838
## + sqft_basement  1 1.0387e+11 1.2640e+15 535838
## + bathrooms      1 2.2942e+09 1.2641e+15 535840
## + floors         1 2.9322e+07 1.2641e+15 535840
## 
## Step:  AIC=533558.7
## price ~ sqft_living + lat + view
## 
##                 Df  Sum of Sq        RSS    AIC
## + grade          1 7.7085e+13 1.0604e+15 532044
## + waterfront     1 4.8301e+13 1.0892e+15 532623
## + yr_built       1 2.9685e+13 1.1078e+15 532989
## + bedrooms       1 2.0105e+13 1.1174e+15 533175
## + long           1 1.8126e+13 1.1194e+15 533214
## + condition      1 1.3259e+13 1.1242e+15 533307
## + yr_renovated   1 1.1033e+13 1.1265e+15 533350
## + sqft_living15  1 9.7773e+12 1.1277e+15 533374
## + sqft_above     1 5.6493e+12 1.1319e+15 533453
## + sqft_basement  1 5.6493e+12 1.1319e+15 533453
## + month         11 3.7455e+12 1.1338e+15 533509
## + year           1 2.5256e+12 1.1350e+15 533513
## + sqft_lot15     1 1.8222e+12 1.1357e+15 533526
## + zipcode        1 1.3136e+12 1.1362e+15 533536
## + floors         1 7.9084e+11 1.1367e+15 533546
## + sqft_lot       1 3.9207e+11 1.1371e+15 533553
## + bathrooms      1 1.9270e+11 1.1373e+15 533557
## <none>                        1.1375e+15 533559
## 
## Step:  AIC=532044.1
## price ~ sqft_living + lat + view + grade
## 
##                 Df  Sum of Sq        RSS    AIC
## + yr_built       1 8.9146e+13 9.7128e+14 530148
## + waterfront     1 5.0218e+13 1.0102e+15 530998
## + condition      1 2.5997e+13 1.0344e+15 531510
## + long           1 2.2309e+13 1.0381e+15 531587
## + yr_renovated   1 1.4312e+13 1.0461e+15 531752
## + bedrooms       1 1.0398e+13 1.0500e+15 531833
## + floors         1 3.9309e+12 1.0565e+15 531966
## + year           1 2.8187e+12 1.0576e+15 531989
## + month         11 3.5927e+12 1.0568e+15 531993
## + bathrooms      1 2.2781e+12 1.0581e+15 532000
## + sqft_lot15     1 1.3272e+12 1.0591e+15 532019
## + sqft_lot       1 2.0910e+11 1.0602e+15 532042
## + sqft_above     1 1.3720e+11 1.0603e+15 532043
## + sqft_basement  1 1.3720e+11 1.0603e+15 532043
## + sqft_living15  1 1.1809e+11 1.0603e+15 532044
## <none>                        1.0604e+15 532044
## + zipcode        1 7.8101e+10 1.0603e+15 532045
## 
## Step:  AIC=530148.3
## price ~ sqft_living + lat + view + grade + yr_built
## 
##                 Df  Sum of Sq        RSS    AIC
## + waterfront     1 5.0449e+13 9.2083e+14 528997
## + bedrooms       1 1.1098e+13 9.6018e+14 529902
## + zipcode        1 6.4623e+12 9.6481e+14 530006
## + bathrooms      1 5.2656e+12 9.6601e+14 530033
## + condition      1 4.2739e+12 9.6700e+14 530055
## + year           1 3.3309e+12 9.6795e+14 530076
## + month         11 3.8104e+12 9.6747e+14 530085
## + long           1 2.8391e+12 9.6844e+14 530087
## + yr_renovated   1 2.3436e+12 9.6893e+14 530098
## + floors         1 2.1809e+12 9.6910e+14 530102
## + sqft_above     1 2.1769e+12 9.6910e+14 530102
## + sqft_basement  1 2.1769e+12 9.6910e+14 530102
## + sqft_lot15     1 1.1384e+12 9.7014e+14 530125
## + sqft_living15  1 6.4656e+11 9.7063e+14 530136
## + sqft_lot       1 2.8898e+11 9.7099e+14 530144
## <none>                        9.7128e+14 530148
## 
## Step:  AIC=528997.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront
## 
##                 Df  Sum of Sq        RSS    AIC
## + bedrooms       1 9.0057e+12 9.1182e+14 528787
## + zipcode        1 6.3395e+12 9.1449e+14 528850
## + bathrooms      1 5.4031e+12 9.1543e+14 528872
## + condition      1 4.4331e+12 9.1639e+14 528895
## + year           1 3.4936e+12 9.1733e+14 528917
## + month         11 3.8991e+12 9.1693e+14 528928
## + long           1 2.5647e+12 9.1826e+14 528939
## + floors         1 1.6628e+12 9.1917e+14 528960
## + sqft_above     1 1.4511e+12 9.1938e+14 528965
## + sqft_basement  1 1.4511e+12 9.1938e+14 528965
## + yr_renovated   1 1.2489e+12 9.1958e+14 528970
## + sqft_lot15     1 1.1644e+12 9.1966e+14 528972
## + sqft_living15  1 9.9743e+11 9.1983e+14 528976
## + sqft_lot       1 2.2143e+11 9.2061e+14 528994
## <none>                        9.2083e+14 528997
## 
## Step:  AIC=528787
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms
## 
##                 Df  Sum of Sq        RSS    AIC
## + bathrooms      1 9.0102e+12 9.0281e+14 528574
## + zipcode        1 6.9168e+12 9.0491e+14 528624
## + condition      1 5.2627e+12 9.0656e+14 528664
## + year           1 3.5695e+12 9.0825e+14 528704
## + month         11 4.0171e+12 9.0781e+14 528714
## + long           1 2.8458e+12 9.0898e+14 528721
## + sqft_lot15     1 1.9499e+12 9.0987e+14 528743
## + floors         1 1.7197e+12 9.1010e+14 528748
## + yr_renovated   1 1.1626e+12 9.1066e+14 528761
## + sqft_above     1 1.1004e+12 9.1072e+14 528763
## + sqft_basement  1 1.1004e+12 9.1072e+14 528763
## + sqft_living15  1 8.3834e+11 9.1098e+14 528769
## + sqft_lot       1 5.6135e+11 9.1126e+14 528776
## <none>                        9.1182e+14 528787
## 
## Step:  AIC=528574.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms
## 
##                 Df  Sum of Sq        RSS    AIC
## + zipcode        1 7.7118e+12 8.9510e+14 528391
## + condition      1 4.8311e+12 8.9798e+14 528460
## + year           1 3.7288e+12 8.9908e+14 528487
## + month         11 4.1427e+12 8.9867e+14 528497
## + long           1 2.0291e+12 9.0078e+14 528528
## + sqft_above     1 1.6201e+12 9.0119e+14 528538
## + sqft_basement  1 1.6201e+12 9.0119e+14 528538
## + sqft_living15  1 1.4832e+12 9.0133e+14 528541
## + sqft_lot15     1 1.4206e+12 9.0139e+14 528542
## + yr_renovated   1 4.2334e+11 9.0239e+14 528566
## + floors         1 3.9100e+11 9.0242e+14 528567
## + sqft_lot       1 3.6234e+11 9.0245e+14 528568
## <none>                        9.0281e+14 528574
## 
## Step:  AIC=528391
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode
## 
##                 Df  Sum of Sq        RSS    AIC
## + long           1 9.6955e+12 8.8540e+14 528158
## + year           1 3.7867e+12 8.9131e+14 528301
## + condition      1 3.5280e+12 8.9157e+14 528308
## + month         11 4.2023e+12 8.9090e+14 528311
## + sqft_lot15     1 2.1877e+12 8.9291e+14 528340
## + sqft_above     1 1.1628e+12 8.9394e+14 528365
## + sqft_basement  1 1.1628e+12 8.9394e+14 528365
## + floors         1 1.0460e+12 8.9405e+14 528368
## + sqft_lot       1 7.2300e+11 8.9438e+14 528376
## + sqft_living15  1 4.3262e+11 8.9467e+14 528383
## + yr_renovated   1 3.8775e+11 8.9471e+14 528384
## <none>                        8.9510e+14 528391
## 
## Step:  AIC=528157.6
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long
## 
##                 Df  Sum of Sq        RSS    AIC
## + year           1 3.8111e+12 8.8159e+14 528066
## + month         11 4.2814e+12 8.8112e+14 528075
## + condition      1 3.2700e+12 8.8213e+14 528080
## + sqft_above     1 2.8185e+12 8.8259e+14 528091
## + sqft_basement  1 2.8185e+12 8.8259e+14 528091
## + sqft_living15  1 1.5701e+12 8.8383e+14 528121
## + floors         1 8.8103e+11 8.8452e+14 528138
## + sqft_lot15     1 7.8011e+11 8.8462e+14 528141
## + yr_renovated   1 5.1267e+11 8.8489e+14 528147
## <none>                        8.8540e+14 528158
## + sqft_lot       1 8.0929e+10 8.8532e+14 528158
## 
## Step:  AIC=528066.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long + year
## 
##                 Df  Sum of Sq        RSS    AIC
## + condition      1 3.6274e+12 8.7797e+14 527979
## + sqft_above     1 2.8009e+12 8.7879e+14 528000
## + sqft_basement  1 2.8009e+12 8.7879e+14 528000
## + sqft_living15  1 1.5610e+12 8.8003e+14 528030
## + month         11 2.0269e+12 8.7957e+14 528039
## + floors         1 9.2843e+11 8.8067e+14 528046
## + sqft_lot15     1 7.8710e+11 8.8081e+14 528049
## + yr_renovated   1 5.6324e+11 8.8103e+14 528055
## + sqft_lot       1 8.9944e+10 8.8150e+14 528066
## <none>                        8.8159e+14 528066
## 
## Step:  AIC=527979.3
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long + year + condition
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_above     1 3.7222e+12 8.7424e+14 527889
## + sqft_basement  1 3.7222e+12 8.7424e+14 527889
## + sqft_living15  1 1.7359e+12 8.7623e+14 527938
## + floors         1 1.4085e+12 8.7656e+14 527947
## + yr_renovated   1 1.2248e+12 8.7674e+14 527951
## + month         11 2.0314e+12 8.7593e+14 527951
## + sqft_lot15     1 8.0771e+11 8.7716e+14 527961
## + sqft_lot       1 8.1878e+10 8.7788e+14 527979
## <none>                        8.7797e+14 527979
## 
## Step:  AIC=527889.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long + year + condition + 
##     sqft_above
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_living15  1 1.2598e+12 8.7298e+14 527860
## + month         11 2.0638e+12 8.7218e+14 527860
## + yr_renovated   1 1.1838e+12 8.7306e+14 527862
## + sqft_lot15     1 9.0238e+11 8.7334e+14 527869
## + floors         1 1.5463e+11 8.7409e+14 527888
## + sqft_lot       1 1.1947e+11 8.7412e+14 527888
## <none>                        8.7424e+14 527889
## 
## Step:  AIC=527860.3
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long + year + condition + 
##     sqft_above + sqft_living15
## 
##                Df  Sum of Sq        RSS    AIC
## + yr_renovated  1 1.3372e+12 8.7165e+14 527829
## + month        11 2.0277e+12 8.7096e+14 527832
## + sqft_lot15    1 9.3286e+11 8.7205e+14 527839
## + floors        1 2.7522e+11 8.7271e+14 527855
## + sqft_lot      1 9.3474e+10 8.7289e+14 527860
## <none>                       8.7298e+14 527860
## 
## Step:  AIC=527829.2
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long + year + condition + 
##     sqft_above + sqft_living15 + yr_renovated
## 
##              Df  Sum of Sq        RSS    AIC
## + month      11 2.0209e+12 8.6963e+14 527801
## + sqft_lot15  1 9.3924e+11 8.7071e+14 527808
## + floors      1 2.1579e+11 8.7143e+14 527826
## + sqft_lot    1 8.7417e+10 8.7156e+14 527829
## <none>                     8.7165e+14 527829
## 
## Step:  AIC=527801
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long + year + condition + 
##     sqft_above + sqft_living15 + yr_renovated + month
## 
##              Df  Sum of Sq        RSS    AIC
## + sqft_lot15  1 9.7978e+11 8.6865e+14 527779
## + floors      1 1.9899e+11 8.6943e+14 527798
## + sqft_lot    1 8.6862e+10 8.6954e+14 527801
## <none>                     8.6963e+14 527801
## 
## Step:  AIC=527778.6
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long + year + condition + 
##     sqft_above + sqft_living15 + yr_renovated + month + sqft_lot15
## 
##            Df  Sum of Sq        RSS    AIC
## + sqft_lot  1 2.8792e+11 8.6836e+14 527773
## + floors    1 1.4315e+11 8.6850e+14 527777
## <none>                   8.6865e+14 527779
## 
## Step:  AIC=527773.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long + year + condition + 
##     sqft_above + sqft_living15 + yr_renovated + month + sqft_lot15 + 
##     sqft_lot
## 
##          Df  Sum of Sq        RSS    AIC
## + floors  1 1.5202e+11 8.6821e+14 527772
## <none>                 8.6836e+14 527773
## 
## Step:  AIC=527771.7
## price ~ sqft_living + lat + view + grade + yr_built + waterfront + 
##     bedrooms + bathrooms + zipcode + long + year + condition + 
##     sqft_above + sqft_living15 + yr_renovated + month + sqft_lot15 + 
##     sqft_lot + floors
## 
##        Df Sum of Sq        RSS    AIC
## <none>              8.6821e+14 527772
## 
## Call:
## lm(formula = price ~ sqft_living + lat + view + grade + yr_built + 
##     waterfront + bedrooms + bathrooms + zipcode + long + year + 
##     condition + sqft_above + sqft_living15 + yr_renovated + month + 
##     sqft_lot15 + sqft_lot + floors, data = df)
## 
## Coefficients:
##   (Intercept)    sqft_living            lat           view          grade  
##     6.590e+06      1.503e+02      6.050e+05      5.267e+04      9.606e+04  
##      yr_built     waterfront       bedrooms      bathrooms        zipcode  
##    -2.614e+03      5.833e+05     -3.593e+04      4.130e+04     -5.842e+02  
##          long       year2015      condition     sqft_above  sqft_living15  
##    -2.153e+05      6.003e+04      2.771e+04      3.131e+01      2.149e+01  
##  yr_renovated        month02        month03        month04        month05  
##     2.076e+01      9.277e+03      3.508e+04      3.662e+04      5.431e+04  
##       month06        month07        month08        month09        month10  
##     6.139e+04      5.739e+04      5.974e+04      5.404e+04      6.151e+04  
##       month11        month12     sqft_lot15       sqft_lot         floors  
##     5.722e+04      5.190e+04     -3.905e-01      1.298e-01      6.969e+03

sqft_lot is an important predictor for the price of a home, but can you explain why lat (East-West) is such a high predictor for the model? Conversely, why do you think long is such a low predictor?

Answer: In the forward selection, adding lat to the model gives a much lower AIC (535838) than adding long (538205), which is why lat is picked early as a strong predictor. Once sqft_living and lat are both in the model, the additional contribution of long becomes much smaller.

My top 10 predictors for the linear model are (sqft_living + lat + view + grade + yr_built + waterfront + bedrooms + bathrooms + zipcode + long). The R^2 for this model is 0.7051, which is not a good R^2 value but is still better than the initial model. This suggests that the predictors in this model are not efficient or relevant enough to predict the outcome, price.

# Reduced model with two interaction terms. In an R formula `a:b` adds only
# the product term, so sqft_living, sqft_living15, lat and long have no
# separate main effects in this fit.
lm1 <- lm(
  formula = price ~ (sqft_living:sqft_living15 + lat:long + view + grade +
    yr_built + waterfront + bedrooms + bathrooms + zipcode + year),
  data = df
)
summary(lm1)
## 
## Call:
## lm(formula = price ~ (sqft_living:sqft_living15 + lat:long + 
##     view + grade + yr_built + waterfront + bedrooms + bathrooms + 
##     zipcode + year), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1643523   -96600   -10003    75976  4161373 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                3.138e+07  2.813e+06  11.157   <2e-16 ***
## view                       4.567e+04  2.071e+03  22.053   <2e-16 ***
## grade                      9.641e+04  2.005e+03  48.083   <2e-16 ***
## yr_built                  -2.943e+03  6.098e+01 -48.265   <2e-16 ***
## waterfront                 6.039e+05  1.714e+04  35.230   <2e-16 ***
## bedrooms                  -1.820e+04  1.771e+03 -10.272   <2e-16 ***
## bathrooms                  7.358e+04  2.861e+03  25.717   <2e-16 ***
## zipcode                   -5.516e+02  2.987e+01 -18.466   <2e-16 ***
## year2015                   2.795e+04  2.904e+03   9.624   <2e-16 ***
## sqft_living:sqft_living15  4.089e-02  6.361e-04  64.291   <2e-16 ***
## lat:long                  -4.820e+03  8.249e+01 -58.429   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 199400 on 21602 degrees of freedom
## Multiple R-squared:  0.7051, Adjusted R-squared:  0.705 
## F-statistic:  5165 on 10 and 21602 DF,  p-value: < 2.2e-16

Convert the variable zipcode from numeric to a factor variable. The value of R^2 changed, and the selected predictors changed as well, once zipcode was converted into a factor.

# Treat zipcode as a categorical label instead of a number, so a model gets
# one coefficient per zip code level rather than a single slope.
df[["zipcode"]] <- as.factor(df[["zipcode"]])

Running model again to find top 10 predictors

# Refit the intercept-only baseline on the updated df.
null_model <- lm(formula = price ~ 1, data = df)
summary(null_model)
## 
## Call:
## lm(formula = price ~ 1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -465088 -218138  -90088  104912 7159912 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   540088       2497   216.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 367100 on 21612 degrees of freedom
# Refit the all-predictors model on the updated df; zipcode now enters as a
# factor with one coefficient per non-reference level.
full_model <- lm(formula = price ~ ., data = df)
summary(full_model)
## 
## Call:
## lm(formula = price ~ ., data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1164529   -69483       20    61316  4412767 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.523e+07  6.123e+06  -4.122 3.78e-05 ***
## bedrooms      -2.692e+04  1.527e+03 -17.626  < 2e-16 ***
## bathrooms      2.349e+04  2.617e+03   8.975  < 2e-16 ***
## sqft_living    1.300e+02  3.536e+00  36.771  < 2e-16 ***
## sqft_lot       2.450e-01  3.833e-02   6.392 1.67e-10 ***
## floors        -4.532e+04  3.152e+03 -14.376  < 2e-16 ***
## waterfront     6.575e+05  1.410e+04  46.638  < 2e-16 ***
## view           5.520e+04  1.750e+03  31.537  < 2e-16 ***
## condition      2.671e+04  1.925e+03  13.874  < 2e-16 ***
## grade          5.706e+04  1.811e+03  31.502  < 2e-16 ***
## sqft_above     7.929e+01  3.625e+00  21.871  < 2e-16 ***
## sqft_basement         NA         NA      NA       NA    
## yr_built      -6.932e+02  6.477e+01 -10.703  < 2e-16 ***
## yr_renovated   1.842e+01  2.940e+00   6.267 3.75e-10 ***
## zipcode98002   3.258e+04  1.443e+04   2.258 0.023957 *  
## zipcode98003  -2.307e+04  1.290e+04  -1.789 0.073654 .  
## zipcode98004   7.170e+05  2.344e+04  30.595  < 2e-16 ***
## zipcode98005   2.479e+05  2.505e+04   9.896  < 2e-16 ***
## zipcode98006   2.280e+05  2.048e+04  11.133  < 2e-16 ***
## zipcode98007   1.947e+05  2.585e+04   7.532 5.21e-14 ***
## zipcode98008   2.048e+05  2.455e+04   8.342  < 2e-16 ***
## zipcode98010   9.673e+04  2.200e+04   4.397 1.10e-05 ***
## zipcode98011   3.566e+04  3.194e+04   1.116 0.264294    
## zipcode98014   7.903e+04  3.509e+04   2.252 0.024327 *  
## zipcode98019   3.806e+04  3.460e+04   1.100 0.271418    
## zipcode98022   4.270e+04  1.910e+04   2.235 0.025408 *  
## zipcode98023  -4.615e+04  1.186e+04  -3.890 0.000100 ***
## zipcode98024   1.493e+05  3.080e+04   4.846 1.27e-06 ***
## zipcode98027   1.516e+05  2.102e+04   7.214 5.61e-13 ***
## zipcode98028   2.760e+04  3.102e+04   0.890 0.373560    
## zipcode98029   1.916e+05  2.401e+04   7.980 1.53e-15 ***
## zipcode98030   7.133e+02  1.418e+04   0.050 0.959881    
## zipcode98031   2.944e+03  1.476e+04   0.199 0.841954    
## zipcode98032  -7.690e+03  1.715e+04  -0.448 0.653852    
## zipcode98033   2.929e+05  2.661e+04  11.007  < 2e-16 ***
## zipcode98034   1.222e+05  2.853e+04   4.284 1.84e-05 ***
## zipcode98038   4.671e+04  1.592e+04   2.934 0.003352 ** 
## zipcode98039   1.252e+06  3.168e+04  39.524  < 2e-16 ***
## zipcode98040   4.591e+05  2.073e+04  22.148  < 2e-16 ***
## zipcode98042   1.009e+04  1.357e+04   0.744 0.457092    
## zipcode98045   1.246e+05  2.941e+04   4.236 2.29e-05 ***
## zipcode98052   1.668e+05  2.717e+04   6.140 8.38e-10 ***
## zipcode98053   1.430e+05  2.910e+04   4.914 9.01e-07 ***
## zipcode98055   2.237e+04  1.645e+04   1.360 0.173967    
## zipcode98056   6.289e+04  1.787e+04   3.520 0.000433 ***
## zipcode98058   1.305e+04  1.554e+04   0.840 0.400776    
## zipcode98059   5.950e+04  1.753e+04   3.394 0.000689 ***
## zipcode98065   8.636e+04  2.711e+04   3.186 0.001446 ** 
## zipcode98070  -6.644e+04  2.068e+04  -3.212 0.001319 ** 
## zipcode98072   7.191e+04  3.177e+04   2.264 0.023611 *  
## zipcode98074   1.323e+05  2.573e+04   5.144 2.71e-07 ***
## zipcode98075   1.338e+05  2.474e+04   5.411 6.35e-08 ***
## zipcode98077   4.946e+04  3.306e+04   1.496 0.134611    
## zipcode98092  -2.484e+04  1.290e+04  -1.926 0.054079 .  
## zipcode98102   4.432e+05  2.743e+04  16.159  < 2e-16 ***
## zipcode98103   2.565e+05  2.574e+04   9.966  < 2e-16 ***
## zipcode98105   3.920e+05  2.643e+04  14.834  < 2e-16 ***
## zipcode98106   9.073e+04  1.907e+04   4.758 1.96e-06 ***
## zipcode98107   2.614e+05  2.653e+04   9.851  < 2e-16 ***
## zipcode98108   7.564e+04  2.105e+04   3.593 0.000328 ***
## zipcode98109   4.197e+05  2.734e+04  15.352  < 2e-16 ***
## zipcode98112   5.511e+05  2.426e+04  22.721  < 2e-16 ***
## zipcode98115   2.495e+05  2.616e+04   9.537  < 2e-16 ***
## zipcode98116   2.200e+05  2.129e+04  10.334  < 2e-16 ***
## zipcode98117   2.275e+05  2.649e+04   8.589  < 2e-16 ***
## zipcode98118   1.232e+05  1.859e+04   6.628 3.48e-11 ***
## zipcode98119   3.983e+05  2.582e+04  15.427  < 2e-16 ***
## zipcode98122   2.749e+05  2.303e+04  11.935  < 2e-16 ***
## zipcode98125   1.127e+05  2.825e+04   3.990 6.63e-05 ***
## zipcode98126   1.349e+05  1.955e+04   6.901 5.30e-12 ***
## zipcode98133   6.862e+04  2.917e+04   2.353 0.018649 *  
## zipcode98136   1.845e+05  2.004e+04   9.206  < 2e-16 ***
## zipcode98144   2.191e+05  2.141e+04  10.234  < 2e-16 ***
## zipcode98146   5.762e+04  1.789e+04   3.220 0.001284 ** 
## zipcode98148   3.645e+04  2.435e+04   1.497 0.134445    
## zipcode98155   4.880e+04  3.034e+04   1.609 0.107714    
## zipcode98166   1.441e+04  1.638e+04   0.880 0.379120    
## zipcode98168   3.908e+04  1.731e+04   2.258 0.023942 *  
## zipcode98177   1.142e+05  3.045e+04   3.750 0.000177 ***
## zipcode98178   5.744e+03  1.788e+04   0.321 0.748000    
## zipcode98188   6.447e+03  1.835e+04   0.351 0.725335    
## zipcode98198  -2.370e+04  1.390e+04  -1.704 0.088310 .  
## zipcode98199   2.995e+05  2.515e+04  11.908  < 2e-16 ***
## lat            2.138e+05  6.321e+04   3.382 0.000721 ***
## long          -1.301e+05  4.540e+04  -2.865 0.004173 ** 
## sqft_living15  1.033e+01  2.881e+00   3.586 0.000336 ***
## sqft_lot15    -1.327e-01  6.033e-02  -2.199 0.027874 *  
## year2015       6.205e+04  7.391e+03   8.395  < 2e-16 ***
## month02        6.357e+03  6.856e+03   0.927 0.353807    
## month03        2.675e+04  6.332e+03   4.224 2.41e-05 ***
## month04        3.252e+04  6.159e+03   5.280 1.30e-07 ***
## month05        4.865e+04  8.145e+03   5.972 2.38e-09 ***
## month06        6.062e+04  9.633e+03   6.293 3.17e-10 ***
## month07        5.426e+04  9.625e+03   5.637 1.75e-08 ***
## month08        5.784e+04  9.710e+03   5.957 2.61e-09 ***
## month09        5.176e+04  9.769e+03   5.299 1.18e-07 ***
## month10        5.443e+04  9.734e+03   5.592 2.28e-08 ***
## month11        5.545e+04  9.965e+03   5.564 2.66e-08 ***
## month12        5.673e+04  9.925e+03   5.716 1.10e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160200 on 21515 degrees of freedom
## Multiple R-squared:  0.8103, Adjusted R-squared:  0.8095 
## F-statistic: 947.7 on 97 and 21515 DF,  p-value: < 2.2e-16
# Rerun forward AIC selection between the refitted null and full models.
step(
  object = null_model,
  scope = list(lower = null_model, upper = full_model),
  direction = "forward"
)
## Start:  AIC=553875.8
## price ~ 1
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_living    1 1.4356e+15 1.4773e+15 539204
## + grade          1 1.2976e+15 1.6153e+15 541134
## + zipcode       69 1.1867e+15 1.7262e+15 542706
## + sqft_above     1 1.0682e+15 1.8447e+15 544004
## + sqft_living15  1 9.9816e+14 1.9148e+15 544810
## + bathrooms      1 8.0329e+14 2.1096e+15 546904
## + view           1 4.5978e+14 2.4531e+15 550165
## + sqft_basement  1 3.0544e+14 2.6075e+15 551484
## + bedrooms       1 2.7696e+14 2.6360e+15 551718
## + lat            1 2.7455e+14 2.6384e+15 551738
## + waterfront     1 2.0668e+14 2.7062e+15 552287
## + floors         1 1.9209e+14 2.7208e+15 552403
## + yr_renovated   1 4.6564e+13 2.8664e+15 553529
## + sqft_lot       1 2.3417e+13 2.8895e+15 553703
## + sqft_lot15     1 1.9801e+13 2.8931e+15 553730
## + yr_built       1 8.4977e+12 2.9044e+15 553815
## + condition      1 3.8514e+12 2.9091e+15 553849
## + month         11 4.6632e+12 2.9083e+15 553863
## + long           1 1.3624e+12 2.9116e+15 553868
## <none>                        2.9129e+15 553876
## + year           1 3.7251e+10 2.9129e+15 553878
## 
## Step:  AIC=539203.5
## price ~ sqft_living
## 
##                 Df  Sum of Sq        RSS    AIC
## + zipcode       69 6.9104e+14 7.8624e+14 525710
## + lat            1 2.1314e+14 1.2641e+15 535838
## + view           1 1.2362e+14 1.3537e+15 537317
## + grade          1 1.2132e+14 1.3560e+15 537353
## + waterfront     1 1.1024e+14 1.3670e+15 537529
## + yr_built       1 9.2854e+13 1.3844e+15 537802
## + long           1 6.6817e+13 1.4105e+15 538205
## + bedrooms       1 4.0635e+13 1.4366e+15 538603
## + yr_renovated   1 2.2405e+13 1.4549e+15 538875
## + sqft_living15  1 2.0109e+13 1.4572e+15 538909
## + condition      1 1.7605e+13 1.4597e+15 538946
## + sqft_lot15     1 6.4407e+12 1.4708e+15 539111
## + sqft_lot       1 3.0113e+12 1.4743e+15 539161
## + month         11 4.0868e+12 1.4732e+15 539166
## + year           1 1.6739e+12 1.4756e+15 539181
## + sqft_above     1 1.2165e+12 1.4761e+15 539188
## + sqft_basement  1 1.2165e+12 1.4761e+15 539188
## + floors         1 2.2991e+11 1.4770e+15 539202
## + bathrooms      1 1.4719e+11 1.4771e+15 539203
## <none>                        1.4773e+15 539204
## 
## Step:  AIC=525710.2
## price ~ sqft_living + zipcode
## 
##                 Df  Sum of Sq        RSS    AIC
## + waterfront     1 1.1577e+14 6.7047e+14 522270
## + view           1 9.4436e+13 6.9180e+14 522947
## + grade          1 4.2251e+13 7.4398e+14 524518
## + bedrooms       1 2.4026e+13 7.6221e+14 525041
## + sqft_living15  1 1.6595e+13 7.6964e+14 525251
## + sqft_above     1 8.9744e+12 7.7726e+14 525464
## + sqft_basement  1 8.9744e+12 7.7726e+14 525464
## + yr_renovated   1 4.6850e+12 7.8155e+14 525583
## + year           1 3.9193e+12 7.8232e+14 525604
## + condition      1 3.8480e+12 7.8239e+14 525606
## + yr_built       1 3.6430e+12 7.8259e+14 525612
## + month         11 3.9947e+12 7.8224e+14 525622
## + sqft_lot       1 2.9632e+12 7.8327e+14 525631
## + sqft_lot15     1 1.2435e+12 7.8499e+14 525678
## + long           1 7.0269e+11 7.8553e+14 525693
## + floors         1 5.5501e+11 7.8568e+14 525697
## + lat            1 1.1386e+11 7.8612e+14 525709
## <none>                        7.8624e+14 525710
## + bathrooms      1 9.9318e+09 7.8623e+14 525712
## 
## Step:  AIC=522269.7
## price ~ sqft_living + zipcode + waterfront
## 
##                 Df  Sum of Sq        RSS    AIC
## + grade          1 3.9096e+13 6.3137e+14 520973
## + view           1 3.7180e+13 6.3329e+14 521039
## + bedrooms       1 1.6264e+13 6.5420e+14 521741
## + sqft_living15  1 1.3958e+13 6.5651e+14 521817
## + sqft_above     1 1.0767e+13 6.5970e+14 521922
## + sqft_basement  1 1.0767e+13 6.5970e+14 521922
## + year           1 3.8981e+12 6.6657e+14 522146
## + sqft_lot       1 3.5114e+12 6.6696e+14 522158
## + condition      1 3.3423e+12 6.6713e+14 522164
## + month         11 3.9016e+12 6.6657e+14 522166
## + yr_built       1 1.9059e+12 6.6856e+14 522210
## + yr_renovated   1 1.6654e+12 6.6880e+14 522218
## + sqft_lot15     1 1.5408e+12 6.6893e+14 522222
## + floors         1 6.3045e+11 6.6984e+14 522251
## + lat            1 4.0376e+11 6.7006e+14 522259
## + long           1 2.3133e+11 6.7024e+14 522264
## <none>                        6.7047e+14 522270
## + bathrooms      1 3.8057e+06 6.7047e+14 522272
## 
## Step:  AIC=520973.2
## price ~ sqft_living + zipcode + waterfront + grade
## 
##                 Df  Sum of Sq        RSS    AIC
## + view           1 3.1697e+13 5.9967e+14 519862
## + yr_built       1 1.8348e+13 6.1302e+14 520338
## + bedrooms       1 9.8149e+12 6.2156e+14 520637
## + condition      1 8.7967e+12 6.2257e+14 520672
## + floors         1 7.9792e+12 6.2339e+14 520700
## + sqft_living15  1 4.2135e+12 6.2716e+14 520830
## + year           1 3.9998e+12 6.2737e+14 520838
## + sqft_lot       1 3.5158e+12 6.2786e+14 520854
## + month         11 3.7544e+12 6.2762e+14 520866
## + yr_renovated   1 2.5487e+12 6.2882e+14 520888
## + sqft_above     1 2.1336e+12 6.2924e+14 520902
## + sqft_basement  1 2.1336e+12 6.2924e+14 520902
## + sqft_lot15     1 1.6165e+12 6.2976e+14 520920
## + bathrooms      1 1.6091e+12 6.2976e+14 520920
## + lat            1 2.7263e+11 6.3110e+14 520966
## + long           1 1.4112e+11 6.3123e+14 520970
## <none>                        6.3137e+14 520973
## 
## Step:  AIC=519861.9
## price ~ sqft_living + zipcode + waterfront + grade + view
## 
##                 Df  Sum of Sq        RSS    AIC
## + yr_built       1 1.3437e+13 5.8624e+14 519374
## + bedrooms       1 7.6258e+12 5.9205e+14 519587
## + condition      1 7.1374e+12 5.9254e+14 519605
## + sqft_above     1 5.4833e+12 5.9419e+14 519665
## + sqft_basement  1 5.4833e+12 5.9419e+14 519665
## + floors         1 5.2288e+12 5.9445e+14 519675
## + year           1 3.7044e+12 5.9597e+14 519730
## + month         11 3.5595e+12 5.9611e+14 519755
## + sqft_lot       1 2.6659e+12 5.9701e+14 519768
## + yr_renovated   1 1.8495e+12 5.9782e+14 519797
## + sqft_living15  1 1.2617e+12 5.9841e+14 519818
## + bathrooms      1 1.1539e+12 5.9852e+14 519822
## + sqft_lot15     1 1.1022e+12 5.9857e+14 519824
## + lat            1 3.4922e+11 5.9933e+14 519851
## + long           1 9.9357e+10 5.9957e+14 519860
## <none>                        5.9967e+14 519862
## 
## Step:  AIC=519374.1
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built
## 
##                 Df  Sum of Sq        RSS    AIC
## + bedrooms       1 7.7652e+12 5.7847e+14 519088
## + sqft_above     1 6.8761e+12 5.7936e+14 519121
## + sqft_basement  1 6.8761e+12 5.7936e+14 519121
## + year           1 3.8284e+12 5.8241e+14 519235
## + month         11 3.6431e+12 5.8259e+14 519261
## + condition      1 2.4804e+12 5.8376e+14 519285
## + sqft_lot       1 1.7107e+12 5.8453e+14 519313
## + sqft_living15  1 7.2759e+11 5.8551e+14 519349
## + floors         1 5.9549e+11 5.8564e+14 519354
## + sqft_lot15     1 5.5026e+11 5.8569e+14 519356
## + lat            1 3.2692e+11 5.8591e+14 519364
## + yr_renovated   1 3.1372e+11 5.8592e+14 519365
## + bathrooms      1 2.1204e+11 5.8603e+14 519368
## + long           1 7.6113e+10 5.8616e+14 519373
## <none>                        5.8624e+14 519374
## 
## Step:  AIC=519088
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_above     1 6.3370e+12 5.7214e+14 518852
## + sqft_basement  1 6.3370e+12 5.7214e+14 518852
## + year           1 3.8895e+12 5.7458e+14 518944
## + month         11 3.7727e+12 5.7470e+14 518969
## + condition      1 2.9190e+12 5.7555e+14 518981
## + sqft_lot       1 1.2696e+12 5.7720e+14 519042
## + bathrooms      1 1.1447e+12 5.7733e+14 519047
## + sqft_living15  1 5.6359e+11 5.7791e+14 519069
## + floors         1 4.7047e+11 5.7800e+14 519072
## + lat            1 2.9843e+11 5.7817e+14 519079
## + yr_renovated   1 2.8851e+11 5.7818e+14 519079
## + sqft_lot15     1 2.8621e+11 5.7819e+14 519079
## + long           1 1.0575e+11 5.7837e+14 519086
## <none>                        5.7847e+14 519088
## 
## Step:  AIC=518851.9
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above
## 
##                 Df  Sum of Sq        RSS    AIC
## + floors         1 4.4329e+12 5.6770e+14 518686
## + condition      1 4.2128e+12 5.6792e+14 518694
## + year           1 3.8676e+12 5.6827e+14 518707
## + month         11 3.8227e+12 5.6831e+14 518729
## + bathrooms      1 1.3994e+12 5.7074e+14 518801
## + sqft_lot       1 1.1648e+12 5.7097e+14 518810
## + lat            1 3.4475e+11 5.7179e+14 518841
## + sqft_lot15     1 2.5962e+11 5.7188e+14 518844
## + yr_renovated   1 2.3738e+11 5.7190e+14 518845
## + sqft_living15  1 2.3714e+11 5.7190e+14 518845
## + long           1 1.4851e+11 5.7199e+14 518848
## <none>                        5.7214e+14 518852
## 
## Step:  AIC=518685.8
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors
## 
##                 Df  Sum of Sq        RSS    AIC
## + condition      1 4.0092e+12 5.6369e+14 518535
## + year           1 3.7412e+12 5.6396e+14 518545
## + month         11 3.7135e+12 5.6399e+14 518566
## + bathrooms      1 2.7803e+12 5.6492e+14 518582
## + sqft_lot       1 9.4701e+11 5.6676e+14 518652
## + yr_renovated   1 5.4811e+11 5.6715e+14 518667
## + lat            1 2.7934e+11 5.6742e+14 518677
## + long           1 1.8903e+11 5.6751e+14 518681
## + sqft_lot15     1 1.4587e+11 5.6756e+14 518682
## + sqft_living15  1 1.2035e+11 5.6758e+14 518683
## <none>                        5.6770e+14 518686
## 
## Step:  AIC=518534.6
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition
## 
##                 Df  Sum of Sq        RSS    AIC
## + year           1 4.1601e+12 5.5953e+14 518376
## + month         11 4.0530e+12 5.5964e+14 518401
## + bathrooms      1 2.4411e+12 5.6125e+14 518443
## + yr_renovated   1 1.2094e+12 5.6248e+14 518490
## + sqft_lot       1 9.9034e+11 5.6270e+14 518499
## + lat            1 3.5099e+11 5.6334e+14 518523
## + sqft_living15  1 2.0048e+11 5.6349e+14 518529
## + long           1 1.8946e+11 5.6350e+14 518529
## + sqft_lot15     1 1.4721e+11 5.6355e+14 518531
## <none>                        5.6369e+14 518535
## 
## Step:  AIC=518376.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition + year
## 
##                 Df  Sum of Sq        RSS    AIC
## + bathrooms      1 2.4807e+12 5.5705e+14 518282
## + yr_renovated   1 1.3452e+12 5.5819e+14 518326
## + month         11 1.6499e+12 5.5788e+14 518335
## + sqft_lot       1 9.4997e+11 5.5858e+14 518342
## + lat            1 3.2476e+11 5.5921e+14 518366
## + sqft_living15  1 2.0954e+11 5.5932e+14 518370
## + long           1 1.9735e+11 5.5934e+14 518371
## + sqft_lot15     1 1.3936e+11 5.5939e+14 518373
## <none>                        5.5953e+14 518376
## 
## Step:  AIC=518282.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition + year + bathrooms
## 
##                 Df  Sum of Sq        RSS    AIC
## + month         11 1.6756e+12 5.5538e+14 518239
## + sqft_lot       1 9.7066e+11 5.5608e+14 518247
## + yr_renovated   1 8.9857e+11 5.5615e+14 518250
## + lat            1 3.2531e+11 5.5673e+14 518272
## + sqft_living15  1 2.5318e+11 5.5680e+14 518275
## + long           1 1.8705e+11 5.5687e+14 518277
## + sqft_lot15     1 1.6583e+11 5.5689e+14 518278
## <none>                        5.5705e+14 518282
## 
## Step:  AIC=518239.3
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition + year + bathrooms + 
##     month
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_lot       1 9.7162e+11 5.5441e+14 518203
## + yr_renovated   1 9.0953e+11 5.5447e+14 518206
## + lat            1 3.2891e+11 5.5505e+14 518229
## + sqft_living15  1 2.3378e+11 5.5514e+14 518232
## + long           1 1.8732e+11 5.5519e+14 518234
## + sqft_lot15     1 1.4760e+11 5.5523e+14 518236
## <none>                        5.5538e+14 518239
## 
## Step:  AIC=518203.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition + year + bathrooms + 
##     month + sqft_lot
## 
##                 Df  Sum of Sq        RSS    AIC
## + yr_renovated   1 9.2987e+11 5.5348e+14 518169
## + lat            1 3.4940e+11 5.5406e+14 518192
## + long           1 2.5487e+11 5.5415e+14 518196
## + sqft_living15  1 2.4884e+11 5.5416e+14 518196
## + sqft_lot15     1 1.1490e+11 5.5429e+14 518201
## <none>                        5.5441e+14 518203
## 
## Step:  AIC=518169.2
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition + year + bathrooms + 
##     month + sqft_lot + yr_renovated
## 
##                 Df  Sum of Sq        RSS    AIC
## + lat            1 3.3998e+11 5.5314e+14 518158
## + sqft_living15  1 3.1724e+11 5.5316e+14 518159
## + long           1 2.7104e+11 5.5320e+14 518161
## + sqft_lot15     1 1.1779e+11 5.5336e+14 518167
## <none>                        5.5348e+14 518169
## 
## Step:  AIC=518157.9
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition + year + bathrooms + 
##     month + sqft_lot + yr_renovated + lat
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_living15  1 3.0922e+11 5.5283e+14 518148
## + long           1 2.3146e+11 5.5290e+14 518151
## + sqft_lot15     1 1.1724e+11 5.5302e+14 518155
## <none>                        5.5314e+14 518158
## 
## Step:  AIC=518147.8
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition + year + bathrooms + 
##     month + sqft_lot + yr_renovated + lat + sqft_living15
## 
##              Df  Sum of Sq        RSS    AIC
## + long        1 2.2901e+11 5.5260e+14 518141
## + sqft_lot15  1 1.4242e+11 5.5268e+14 518144
## <none>                     5.5283e+14 518148
## 
## Step:  AIC=518140.9
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition + year + bathrooms + 
##     month + sqft_lot + yr_renovated + lat + sqft_living15 + long
## 
##              Df  Sum of Sq        RSS    AIC
## + sqft_lot15  1 1.2419e+11 5.5247e+14 518138
## <none>                     5.5260e+14 518141
## 
## Step:  AIC=518138
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + floors + condition + year + bathrooms + 
##     month + sqft_lot + yr_renovated + lat + sqft_living15 + long + 
##     sqft_lot15
## 
##        Df Sum of Sq        RSS    AIC
## <none>              5.5247e+14 518138
## 
## Call:
## lm(formula = price ~ sqft_living + zipcode + waterfront + grade + 
##     view + yr_built + bedrooms + sqft_above + floors + condition + 
##     year + bathrooms + month + sqft_lot + yr_renovated + lat + 
##     sqft_living15 + long + sqft_lot15, data = df)
## 
## Coefficients:
##   (Intercept)    sqft_living   zipcode98002   zipcode98003   zipcode98004  
##    -2.523e+07      1.300e+02      3.258e+04     -2.307e+04      7.170e+05  
##  zipcode98005   zipcode98006   zipcode98007   zipcode98008   zipcode98010  
##     2.479e+05      2.280e+05      1.947e+05      2.048e+05      9.673e+04  
##  zipcode98011   zipcode98014   zipcode98019   zipcode98022   zipcode98023  
##     3.566e+04      7.903e+04      3.806e+04      4.270e+04     -4.615e+04  
##  zipcode98024   zipcode98027   zipcode98028   zipcode98029   zipcode98030  
##     1.493e+05      1.516e+05      2.760e+04      1.916e+05      7.133e+02  
##  zipcode98031   zipcode98032   zipcode98033   zipcode98034   zipcode98038  
##     2.944e+03     -7.690e+03      2.929e+05      1.222e+05      4.671e+04  
##  zipcode98039   zipcode98040   zipcode98042   zipcode98045   zipcode98052  
##     1.252e+06      4.591e+05      1.009e+04      1.246e+05      1.668e+05  
##  zipcode98053   zipcode98055   zipcode98056   zipcode98058   zipcode98059  
##     1.430e+05      2.237e+04      6.289e+04      1.305e+04      5.950e+04  
##  zipcode98065   zipcode98070   zipcode98072   zipcode98074   zipcode98075  
##     8.636e+04     -6.644e+04      7.191e+04      1.323e+05      1.338e+05  
##  zipcode98077   zipcode98092   zipcode98102   zipcode98103   zipcode98105  
##     4.946e+04     -2.484e+04      4.432e+05      2.565e+05      3.920e+05  
##  zipcode98106   zipcode98107   zipcode98108   zipcode98109   zipcode98112  
##     9.073e+04      2.614e+05      7.564e+04      4.197e+05      5.511e+05  
##  zipcode98115   zipcode98116   zipcode98117   zipcode98118   zipcode98119  
##     2.495e+05      2.200e+05      2.275e+05      1.232e+05      3.983e+05  
##  zipcode98122   zipcode98125   zipcode98126   zipcode98133   zipcode98136  
##     2.749e+05      1.127e+05      1.349e+05      6.862e+04      1.845e+05  
##  zipcode98144   zipcode98146   zipcode98148   zipcode98155   zipcode98166  
##     2.191e+05      5.762e+04      3.645e+04      4.880e+04      1.441e+04  
##  zipcode98168   zipcode98177   zipcode98178   zipcode98188   zipcode98198  
##     3.908e+04      1.142e+05      5.744e+03      6.447e+03     -2.370e+04  
##  zipcode98199     waterfront          grade           view       yr_built  
##     2.995e+05      6.575e+05      5.706e+04      5.520e+04     -6.932e+02  
##      bedrooms     sqft_above         floors      condition       year2015  
##    -2.692e+04      7.929e+01     -4.532e+04      2.671e+04      6.205e+04  
##     bathrooms        month02        month03        month04        month05  
##     2.349e+04      6.357e+03      2.675e+04      3.252e+04      4.865e+04  
##       month06        month07        month08        month09        month10  
##     6.062e+04      5.426e+04      5.784e+04      5.176e+04      5.443e+04  
##       month11        month12       sqft_lot   yr_renovated            lat  
##     5.545e+04      5.673e+04      2.450e-01      1.842e+01      2.138e+05  
## sqft_living15           long     sqft_lot15  
##     1.033e+01     -1.301e+05     -1.327e-01

linear regression model 2

# Second linear model: regress price on the ten terms that forward selection
# had added by the step ending with `+ condition` in the output above.
# The outer parentheses in the formula are kept so that the call echoed by
# summary() stays the same.
lm2 <- lm(
  price ~ (sqft_living + zipcode + waterfront + grade + view + yr_built +
    bedrooms + sqft_above + floors + condition),
  data = df
)
summary(lm2)
## 
## Call:
## lm(formula = price ~ (sqft_living + zipcode + waterfront + grade + 
##     view + yr_built + bedrooms + sqft_above + floors + condition), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1119069   -71685     -430    62524  4427499 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.175e+05  1.140e+05   7.170 7.72e-13 ***
## sqft_living   1.471e+02  3.182e+00  46.231  < 2e-16 ***
## zipcode98002  2.692e+04  1.431e+04   1.881 0.059999 .  
## zipcode98003 -1.648e+04  1.289e+04  -1.278 0.201136    
## zipcode98004  7.780e+05  1.261e+04  61.718  < 2e-16 ***
## zipcode98005  3.032e+05  1.524e+04  19.900  < 2e-16 ***
## zipcode98006  2.671e+05  1.136e+04  23.510  < 2e-16 ***
## zipcode98007  2.429e+05  1.612e+04  15.067  < 2e-16 ***
## zipcode98008  2.486e+05  1.290e+04  19.267  < 2e-16 ***
## zipcode98010  7.487e+04  1.829e+04   4.093 4.28e-05 ***
## zipcode98011  1.242e+05  1.439e+04   8.630  < 2e-16 ***
## zipcode98014  1.107e+05  1.686e+04   6.564 5.35e-11 ***
## zipcode98019  9.650e+04  1.452e+04   6.647 3.07e-11 ***
## zipcode98022 -2.359e+03  1.363e+04  -0.173 0.862616    
## zipcode98023 -3.485e+04  1.119e+04  -3.116 0.001836 ** 
## zipcode98024  1.728e+05  1.992e+04   8.672  < 2e-16 ***
## zipcode98027  1.722e+05  1.172e+04  14.693  < 2e-16 ***
## zipcode98028  1.206e+05  1.285e+04   9.386  < 2e-16 ***
## zipcode98029  2.136e+05  1.250e+04  17.096  < 2e-16 ***
## zipcode98030  7.143e+03  1.322e+04   0.540 0.588865    
## zipcode98031  1.669e+04  1.296e+04   1.287 0.197954    
## zipcode98032  1.370e+03  1.681e+04   0.081 0.935064    
## zipcode98033  3.640e+05  1.158e+04  31.447  < 2e-16 ***
## zipcode98034  2.020e+05  1.099e+04  18.383  < 2e-16 ***
## zipcode98038  3.768e+04  1.083e+04   3.478 0.000506 ***
## zipcode98039  1.325e+06  2.462e+04  53.830  < 2e-16 ***
## zipcode98040  5.136e+05  1.308e+04  39.264  < 2e-16 ***
## zipcode98042  8.091e+03  1.097e+04   0.737 0.460934    
## zipcode98045  9.646e+04  1.383e+04   6.977 3.10e-12 ***
## zipcode98052  2.273e+05  1.091e+04  20.831  < 2e-16 ***
## zipcode98053  1.943e+05  1.182e+04  16.438  < 2e-16 ***
## zipcode98055  4.754e+04  1.305e+04   3.642 0.000272 ***
## zipcode98056  9.562e+04  1.172e+04   8.161 3.49e-16 ***
## zipcode98058  3.017e+04  1.141e+04   2.645 0.008187 ** 
## zipcode98059  8.482e+04  1.136e+04   7.464 8.69e-14 ***
## zipcode98065  8.727e+04  1.260e+04   6.928 4.40e-12 ***
## zipcode98070 -7.681e+03  1.739e+04  -0.442 0.658794    
## zipcode98072  1.530e+05  1.301e+04  11.758  < 2e-16 ***
## zipcode98074  1.705e+05  1.159e+04  14.711  < 2e-16 ***
## zipcode98075  1.660e+05  1.221e+04  13.591  < 2e-16 ***
## zipcode98077  1.212e+05  1.443e+04   8.400  < 2e-16 ***
## zipcode98092 -3.477e+04  1.214e+04  -2.865 0.004178 ** 
## zipcode98102  5.141e+05  1.830e+04  28.085  < 2e-16 ***
## zipcode98103  3.411e+05  1.113e+04  30.646  < 2e-16 ***
## zipcode98105  4.716e+05  1.400e+04  33.697  < 2e-16 ***
## zipcode98106  1.504e+05  1.234e+04  12.189  < 2e-16 ***
## zipcode98107  3.522e+05  1.335e+04  26.395  < 2e-16 ***
## zipcode98108  1.294e+05  1.469e+04   8.808  < 2e-16 ***
## zipcode98109  4.965e+05  1.799e+04  27.593  < 2e-16 ***
## zipcode98112  6.226e+05  1.351e+04  46.092  < 2e-16 ***
## zipcode98115  3.314e+05  1.106e+04  29.980  < 2e-16 ***
## zipcode98116  2.928e+05  1.255e+04  23.326  < 2e-16 ***
## zipcode98117  3.178e+05  1.119e+04  28.414  < 2e-16 ***
## zipcode98118  1.733e+05  1.129e+04  15.352  < 2e-16 ***
## zipcode98119  4.819e+05  1.504e+04  32.046  < 2e-16 ***
## zipcode98122  3.427e+05  1.312e+04  26.127  < 2e-16 ***
## zipcode98125  2.007e+05  1.175e+04  17.078  < 2e-16 ***
## zipcode98126  1.939e+05  1.224e+04  15.835  < 2e-16 ***
## zipcode98133  1.666e+05  1.126e+04  14.794  < 2e-16 ***
## zipcode98136  2.482e+05  1.329e+04  18.686  < 2e-16 ***
## zipcode98144  2.821e+05  1.242e+04  22.706  < 2e-16 ***
## zipcode98146  1.088e+05  1.285e+04   8.468  < 2e-16 ***
## zipcode98148  7.140e+04  2.308e+04   3.093 0.001983 ** 
## zipcode98155  1.468e+05  1.149e+04  12.772  < 2e-16 ***
## zipcode98166  5.472e+04  1.333e+04   4.106 4.03e-05 ***
## zipcode98168  7.563e+04  1.311e+04   5.767 8.20e-09 ***
## zipcode98177  2.172e+05  1.336e+04  16.263  < 2e-16 ***
## zipcode98178  3.823e+04  1.322e+04   2.893 0.003824 ** 
## zipcode98188  3.635e+04  1.629e+04   2.231 0.025680 *  
## zipcode98198 -1.398e+03  1.292e+04  -0.108 0.913827    
## zipcode98199  3.883e+05  1.269e+04  30.597  < 2e-16 ***
## waterfront    6.587e+05  1.418e+04  46.438  < 2e-16 ***
## grade         6.039e+04  1.767e+03  34.180  < 2e-16 ***
## view          5.721e+04  1.738e+03  32.919  < 2e-16 ***
## yr_built     -6.724e+02  5.875e+01 -11.444  < 2e-16 ***
## bedrooms     -2.470e+04  1.506e+03 -16.401  < 2e-16 ***
## sqft_above    7.558e+01  3.597e+00  21.014  < 2e-16 ***
## floors       -3.923e+04  3.086e+03 -12.711  < 2e-16 ***
## condition     2.352e+04  1.901e+03  12.376  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 161800 on 21534 degrees of freedom
## Multiple R-squared:  0.8065, Adjusted R-squared:  0.8058 
## F-statistic:  1151 on 78 and 21534 DF,  p-value: < 2.2e-16

Zipcode now becomes one of the top 10 predictors because it is now a factor variable rather than a numeric one.

The value of R^2 is 0.806, which shows that the model is somewhat effective and can be further improved by finding more important predictors than the current ones. Most of the predictors in the model are now highly significant.

Does the model suffer from heteroskedasticity? (Use bptest in the lmtest package in R. In Python, use het_breuschpagan test in the statsmodels package)

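Answer: Yes, the model suffers from heteroskedasticity, since the p-value of the Breusch-Pagan test (< 2.2e-16) is less than 0.05, so the null hypothesis of constant error variance is rejected.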

# Studentized Breusch-Pagan test for heteroskedasticity in the lm2 fit.
lmtest::bptest(lm2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm2
## BP = 2884, df = 78, p-value < 2.2e-16

Are there nonlinearities in the model? (Use the plots to discern this) Answer: Yes, there are nonlinearities in the model. A trend line in the Residuals vs Fitted plot that is not flat and horizontal around zero is an indication of nonlinearity.

Are the residuals normally distributed? Answer: No, the residuals are not normally distributed: as we can see from the Normal Q-Q plot, the points do not follow the reference line.

# Split the plotting device into a 2 x 2 grid so the four diagnostic charts
# for lm2 are drawn together in one panel.
par(mfrow = c(2, 2))
plot(lm2)


Section 3 - Extra Credit

# Treat the bedroom count as a categorical predictor from here on.
df[["bedrooms"]] <- as.factor(df[["bedrooms"]])

# Intercept-only model; it is the starting point and lower bound of the
# forward selection run further down.
null_model <- lm(price ~ 1, data = df)
summary(null_model)
## 
## Call:
## lm(formula = price ~ 1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -465088 -218138  -90088  104912 7159912 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   540088       2497   216.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 367100 on 21612 degrees of freedom
# Model using every other column of df as a predictor of price.
# NOTE(review): the summary below reports the sqft_basement coefficient as NA
# ("1 not defined because of singularities").
full_model <- lm(price ~ ., data = df)
summary(full_model)
## 
## Call:
## lm(formula = price ~ ., data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1166293   -69414     -433    61664  4377522 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.517e+07  6.115e+06  -4.116 3.86e-05 ***
## bedrooms1     -3.878e+04  4.606e+04  -0.842 0.399823    
## bedrooms2     -8.181e+04  4.473e+04  -1.829 0.067438 .  
## bedrooms3     -1.132e+05  4.468e+04  -2.533 0.011327 *  
## bedrooms4     -1.499e+05  4.473e+04  -3.352 0.000803 ***
## bedrooms5     -1.608e+05  4.493e+04  -3.578 0.000347 ***
## bedrooms6     -1.756e+05  4.588e+04  -3.827 0.000130 ***
## bedrooms7     -2.787e+05  5.199e+04  -5.360 8.39e-08 ***
## bedrooms8     -1.483e+05  6.332e+04  -2.342 0.019196 *  
## bedrooms9     -3.662e+05  7.978e+04  -4.590 4.46e-06 ***
## bedrooms10    -4.744e+05  1.029e+05  -4.610 4.05e-06 ***
## bedrooms11    -2.723e+05  1.666e+05  -1.635 0.102116    
## bedrooms33    -6.439e+04  1.663e+05  -0.387 0.698706    
## bathrooms      2.424e+04  2.629e+03   9.220  < 2e-16 ***
## sqft_living    1.305e+02  3.558e+00  36.673  < 2e-16 ***
## sqft_lot       2.422e-01  3.829e-02   6.325 2.58e-10 ***
## floors        -4.471e+04  3.153e+03 -14.181  < 2e-16 ***
## waterfront     6.547e+05  1.408e+04  46.488  < 2e-16 ***
## view           5.477e+04  1.750e+03  31.307  < 2e-16 ***
## condition      2.702e+04  1.928e+03  14.009  < 2e-16 ***
## grade          5.746e+04  1.823e+03  31.515  < 2e-16 ***
## sqft_above     7.994e+01  3.625e+00  22.053  < 2e-16 ***
## sqft_basement         NA         NA      NA       NA    
## yr_built      -7.051e+02  6.488e+01 -10.869  < 2e-16 ***
## yr_renovated   1.839e+01  2.938e+00   6.260 3.93e-10 ***
## zipcode98002   3.135e+04  1.441e+04   2.175 0.029607 *  
## zipcode98003  -2.400e+04  1.288e+04  -1.863 0.062409 .  
## zipcode98004   7.159e+05  2.342e+04  30.573  < 2e-16 ***
## zipcode98005   2.477e+05  2.502e+04   9.899  < 2e-16 ***
## zipcode98006   2.278e+05  2.046e+04  11.133  < 2e-16 ***
## zipcode98007   1.950e+05  2.583e+04   7.549 4.57e-14 ***
## zipcode98008   2.055e+05  2.453e+04   8.379  < 2e-16 ***
## zipcode98010   9.688e+04  2.197e+04   4.410 1.04e-05 ***
## zipcode98011   3.540e+04  3.191e+04   1.110 0.267219    
## zipcode98014   7.830e+04  3.505e+04   2.234 0.025481 *  
## zipcode98019   3.875e+04  3.456e+04   1.121 0.262272    
## zipcode98022   4.171e+04  1.908e+04   2.186 0.028796 *  
## zipcode98023  -4.618e+04  1.185e+04  -3.898 9.71e-05 ***
## zipcode98024   1.483e+05  3.078e+04   4.819 1.46e-06 ***
## zipcode98027   1.512e+05  2.100e+04   7.199 6.26e-13 ***
## zipcode98028   2.876e+04  3.098e+04   0.928 0.353243    
## zipcode98029   1.921e+05  2.398e+04   8.010 1.21e-15 ***
## zipcode98030   1.095e+03  1.416e+04   0.077 0.938348    
## zipcode98031   3.575e+03  1.474e+04   0.242 0.808397    
## zipcode98032  -7.741e+03  1.712e+04  -0.452 0.651253    
## zipcode98033   2.933e+05  2.658e+04  11.035  < 2e-16 ***
## zipcode98034   1.230e+05  2.850e+04   4.317 1.59e-05 ***
## zipcode98038   4.720e+04  1.590e+04   2.969 0.002994 ** 
## zipcode98039   1.251e+06  3.164e+04  39.522  < 2e-16 ***
## zipcode98040   4.576e+05  2.071e+04  22.094  < 2e-16 ***
## zipcode98042   1.001e+04  1.355e+04   0.739 0.460060    
## zipcode98045   1.249e+05  2.938e+04   4.251 2.14e-05 ***
## zipcode98052   1.673e+05  2.713e+04   6.166 7.13e-10 ***
## zipcode98053   1.425e+05  2.907e+04   4.904 9.48e-07 ***
## zipcode98055   2.101e+04  1.643e+04   1.279 0.200851    
## zipcode98056   6.305e+04  1.785e+04   3.532 0.000413 ***
## zipcode98058   1.335e+04  1.552e+04   0.861 0.389516    
## zipcode98059   5.996e+04  1.751e+04   3.425 0.000616 ***
## zipcode98065   8.599e+04  2.708e+04   3.175 0.001499 ** 
## zipcode98070  -6.859e+04  2.066e+04  -3.319 0.000904 ***
## zipcode98072   7.303e+04  3.173e+04   2.301 0.021380 *  
## zipcode98074   1.328e+05  2.570e+04   5.169 2.37e-07 ***
## zipcode98075   1.350e+05  2.471e+04   5.463 4.75e-08 ***
## zipcode98077   5.075e+04  3.302e+04   1.537 0.124307    
## zipcode98092  -2.432e+04  1.288e+04  -1.889 0.058939 .  
## zipcode98102   4.413e+05  2.741e+04  16.099  < 2e-16 ***
## zipcode98103   2.532e+05  2.571e+04   9.847  < 2e-16 ***
## zipcode98105   3.914e+05  2.641e+04  14.820  < 2e-16 ***
## zipcode98106   8.882e+04  1.905e+04   4.662 3.16e-06 ***
## zipcode98107   2.586e+05  2.651e+04   9.754  < 2e-16 ***
## zipcode98108   7.375e+04  2.103e+04   3.506 0.000455 ***
## zipcode98109   4.182e+05  2.731e+04  15.315  < 2e-16 ***
## zipcode98112   5.478e+05  2.423e+04  22.603  < 2e-16 ***
## zipcode98115   2.491e+05  2.613e+04   9.534  < 2e-16 ***
## zipcode98116   2.188e+05  2.126e+04  10.290  < 2e-16 ***
## zipcode98117   2.264e+05  2.646e+04   8.554  < 2e-16 ***
## zipcode98118   1.212e+05  1.857e+04   6.523 7.04e-11 ***
## zipcode98119   3.960e+05  2.579e+04  15.350  < 2e-16 ***
## zipcode98122   2.724e+05  2.301e+04  11.837  < 2e-16 ***
## zipcode98125   1.128e+05  2.822e+04   3.999 6.39e-05 ***
## zipcode98126   1.328e+05  1.954e+04   6.795 1.11e-11 ***
## zipcode98133   6.827e+04  2.913e+04   2.343 0.019120 *  
## zipcode98136   1.827e+05  2.002e+04   9.127  < 2e-16 ***
## zipcode98144   2.178e+05  2.139e+04  10.184  < 2e-16 ***
## zipcode98146   5.658e+04  1.787e+04   3.166 0.001549 ** 
## zipcode98148   3.643e+04  2.431e+04   1.498 0.134073    
## zipcode98155   4.950e+04  3.030e+04   1.634 0.102373    
## zipcode98166   1.402e+04  1.636e+04   0.857 0.391311    
## zipcode98168   3.790e+04  1.729e+04   2.192 0.028354 *  
## zipcode98177   1.140e+05  3.041e+04   3.747 0.000179 ***
## zipcode98178   5.822e+03  1.786e+04   0.326 0.744381    
## zipcode98188   6.150e+03  1.833e+04   0.336 0.737168    
## zipcode98198  -2.439e+04  1.389e+04  -1.756 0.079096 .  
## zipcode98199   2.988e+05  2.512e+04  11.892  < 2e-16 ***
## lat            2.102e+05  6.314e+04   3.330 0.000871 ***
## long          -1.313e+05  4.535e+04  -2.896 0.003778 ** 
## sqft_living15  1.067e+01  2.891e+00   3.692 0.000223 ***
## sqft_lot15    -1.378e-01  6.027e-02  -2.287 0.022227 *  
## year2015       6.119e+04  7.382e+03   8.289  < 2e-16 ***
## month02        5.761e+03  6.847e+03   0.841 0.400128    
## month03        2.664e+04  6.323e+03   4.214 2.52e-05 ***
## month04        3.245e+04  6.151e+03   5.275 1.34e-07 ***
## month05        4.791e+04  8.136e+03   5.889 3.94e-09 ***
## month06        5.923e+04  9.622e+03   6.155 7.63e-10 ***
## month07        5.325e+04  9.613e+03   5.540 3.07e-08 ***
## month08        5.679e+04  9.698e+03   5.855 4.84e-09 ***
## month09        5.081e+04  9.757e+03   5.208 1.92e-07 ***
## month10        5.351e+04  9.724e+03   5.503 3.77e-08 ***
## month11        5.425e+04  9.954e+03   5.450 5.10e-08 ***
## month12        5.545e+04  9.914e+03   5.593 2.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160000 on 21504 degrees of freedom
## Multiple R-squared:  0.811,  Adjusted R-squared:   0.81 
## F-statistic: 854.4 on 108 and 21504 DF,  p-value: < 2.2e-16
# Forward stepwise selection by AIC: start from the intercept-only model and
# consider adding terms up to those in the full model.
step(
  null_model,
  scope = list(lower = null_model, upper = full_model),
  direction = "forward"
)
## Start:  AIC=553875.8
## price ~ 1
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_living    1 1.4356e+15 1.4773e+15 539204
## + grade          1 1.2976e+15 1.6153e+15 541134
## + zipcode       69 1.1867e+15 1.7262e+15 542706
## + sqft_above     1 1.0682e+15 1.8447e+15 544004
## + sqft_living15  1 9.9816e+14 1.9148e+15 544810
## + bathrooms      1 8.0329e+14 2.1096e+15 546904
## + view           1 4.5978e+14 2.4531e+15 550165
## + bedrooms      12 3.1033e+14 2.6026e+15 551465
## + sqft_basement  1 3.0544e+14 2.6075e+15 551484
## + lat            1 2.7455e+14 2.6384e+15 551738
## + waterfront     1 2.0668e+14 2.7062e+15 552287
## + floors         1 1.9209e+14 2.7208e+15 552403
## + yr_renovated   1 4.6564e+13 2.8664e+15 553529
## + sqft_lot       1 2.3417e+13 2.8895e+15 553703
## + sqft_lot15     1 1.9801e+13 2.8931e+15 553730
## + yr_built       1 8.4977e+12 2.9044e+15 553815
## + condition      1 3.8514e+12 2.9091e+15 553849
## + month         11 4.6632e+12 2.9083e+15 553863
## + long           1 1.3624e+12 2.9116e+15 553868
## <none>                        2.9129e+15 553876
## + year           1 3.7251e+10 2.9129e+15 553878
## 
## Step:  AIC=539203.5
## price ~ sqft_living
## 
##                 Df  Sum of Sq        RSS    AIC
## + zipcode       69 6.9104e+14 7.8624e+14 525710
## + lat            1 2.1314e+14 1.2641e+15 535838
## + view           1 1.2362e+14 1.3537e+15 537317
## + grade          1 1.2132e+14 1.3560e+15 537353
## + waterfront     1 1.1024e+14 1.3670e+15 537529
## + yr_built       1 9.2854e+13 1.3844e+15 537802
## + long           1 6.6817e+13 1.4105e+15 538205
## + bedrooms      12 5.8596e+13 1.4187e+15 538353
## + yr_renovated   1 2.2405e+13 1.4549e+15 538875
## + sqft_living15  1 2.0109e+13 1.4572e+15 538909
## + condition      1 1.7605e+13 1.4597e+15 538946
## + sqft_lot15     1 6.4407e+12 1.4708e+15 539111
## + sqft_lot       1 3.0113e+12 1.4743e+15 539161
## + month         11 4.0868e+12 1.4732e+15 539166
## + year           1 1.6739e+12 1.4756e+15 539181
## + sqft_above     1 1.2165e+12 1.4761e+15 539188
## + sqft_basement  1 1.2165e+12 1.4761e+15 539188
## + floors         1 2.2991e+11 1.4770e+15 539202
## + bathrooms      1 1.4719e+11 1.4771e+15 539203
## <none>                        1.4773e+15 539204
## 
## Step:  AIC=525710.2
## price ~ sqft_living + zipcode
## 
##                 Df  Sum of Sq        RSS    AIC
## + waterfront     1 1.1577e+14 6.7047e+14 522270
## + view           1 9.4436e+13 6.9180e+14 522947
## + grade          1 4.2251e+13 7.4398e+14 524518
## + bedrooms      12 2.7266e+13 7.5897e+14 524971
## + sqft_living15  1 1.6595e+13 7.6964e+14 525251
## + sqft_above     1 8.9744e+12 7.7726e+14 525464
## + sqft_basement  1 8.9744e+12 7.7726e+14 525464
## + yr_renovated   1 4.6850e+12 7.8155e+14 525583
## + year           1 3.9193e+12 7.8232e+14 525604
## + condition      1 3.8480e+12 7.8239e+14 525606
## + yr_built       1 3.6430e+12 7.8259e+14 525612
## + month         11 3.9947e+12 7.8224e+14 525622
## + sqft_lot       1 2.9632e+12 7.8327e+14 525631
## + sqft_lot15     1 1.2435e+12 7.8499e+14 525678
## + long           1 7.0269e+11 7.8553e+14 525693
## + floors         1 5.5501e+11 7.8568e+14 525697
## + lat            1 1.1386e+11 7.8612e+14 525709
## <none>                        7.8624e+14 525710
## + bathrooms      1 9.9318e+09 7.8623e+14 525712
## 
## Step:  AIC=522269.7
## price ~ sqft_living + zipcode + waterfront
## 
##                 Df  Sum of Sq        RSS    AIC
## + grade          1 3.9096e+13 6.3137e+14 520973
## + view           1 3.7180e+13 6.3329e+14 521039
## + bedrooms      12 1.8522e+13 6.5195e+14 521688
## + sqft_living15  1 1.3958e+13 6.5651e+14 521817
## + sqft_above     1 1.0767e+13 6.5970e+14 521922
## + sqft_basement  1 1.0767e+13 6.5970e+14 521922
## + year           1 3.8981e+12 6.6657e+14 522146
## + sqft_lot       1 3.5114e+12 6.6696e+14 522158
## + condition      1 3.3423e+12 6.6713e+14 522164
## + month         11 3.9016e+12 6.6657e+14 522166
## + yr_built       1 1.9059e+12 6.6856e+14 522210
## + yr_renovated   1 1.6654e+12 6.6880e+14 522218
## + sqft_lot15     1 1.5408e+12 6.6893e+14 522222
## + floors         1 6.3045e+11 6.6984e+14 522251
## + lat            1 4.0376e+11 6.7006e+14 522259
## + long           1 2.3133e+11 6.7024e+14 522264
## <none>                        6.7047e+14 522270
## + bathrooms      1 3.8057e+06 6.7047e+14 522272
## 
## Step:  AIC=520973.2
## price ~ sqft_living + zipcode + waterfront + grade
## 
##                 Df  Sum of Sq        RSS    AIC
## + view           1 3.1697e+13 5.9967e+14 519862
## + yr_built       1 1.8348e+13 6.1302e+14 520338
## + bedrooms      12 1.2047e+13 6.1933e+14 520581
## + condition      1 8.7967e+12 6.2257e+14 520672
## + floors         1 7.9792e+12 6.2339e+14 520700
## + sqft_living15  1 4.2135e+12 6.2716e+14 520830
## + year           1 3.9998e+12 6.2737e+14 520838
## + sqft_lot       1 3.5158e+12 6.2786e+14 520854
## + month         11 3.7544e+12 6.2762e+14 520866
## + yr_renovated   1 2.5487e+12 6.2882e+14 520888
## + sqft_above     1 2.1336e+12 6.2924e+14 520902
## + sqft_basement  1 2.1336e+12 6.2924e+14 520902
## + sqft_lot15     1 1.6165e+12 6.2976e+14 520920
## + bathrooms      1 1.6091e+12 6.2976e+14 520920
## + lat            1 2.7263e+11 6.3110e+14 520966
## + long           1 1.4112e+11 6.3123e+14 520970
## <none>                        6.3137e+14 520973
## 
## Step:  AIC=519861.9
## price ~ sqft_living + zipcode + waterfront + grade + view
## 
##                 Df  Sum of Sq        RSS    AIC
## + yr_built       1 1.3437e+13 5.8624e+14 519374
## + bedrooms      12 9.3922e+12 5.9028e+14 519545
## + condition      1 7.1374e+12 5.9254e+14 519605
## + sqft_above     1 5.4833e+12 5.9419e+14 519665
## + sqft_basement  1 5.4833e+12 5.9419e+14 519665
## + floors         1 5.2288e+12 5.9445e+14 519675
## + year           1 3.7044e+12 5.9597e+14 519730
## + month         11 3.5595e+12 5.9611e+14 519755
## + sqft_lot       1 2.6659e+12 5.9701e+14 519768
## + yr_renovated   1 1.8495e+12 5.9782e+14 519797
## + sqft_living15  1 1.2617e+12 5.9841e+14 519818
## + bathrooms      1 1.1539e+12 5.9852e+14 519822
## + sqft_lot15     1 1.1022e+12 5.9857e+14 519824
## + lat            1 3.4922e+11 5.9933e+14 519851
## + long           1 9.9357e+10 5.9957e+14 519860
## <none>                        5.9967e+14 519862
## 
## Step:  AIC=519374.1
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built
## 
##                 Df  Sum of Sq        RSS    AIC
## + bedrooms      12 9.4873e+12 5.7675e+14 519046
## + sqft_above     1 6.8761e+12 5.7936e+14 519121
## + sqft_basement  1 6.8761e+12 5.7936e+14 519121
## + year           1 3.8284e+12 5.8241e+14 519235
## + month         11 3.6431e+12 5.8259e+14 519261
## + condition      1 2.4804e+12 5.8376e+14 519285
## + sqft_lot       1 1.7107e+12 5.8453e+14 519313
## + sqft_living15  1 7.2759e+11 5.8551e+14 519349
## + floors         1 5.9549e+11 5.8564e+14 519354
## + sqft_lot15     1 5.5026e+11 5.8569e+14 519356
## + lat            1 3.2692e+11 5.8591e+14 519364
## + yr_renovated   1 3.1372e+11 5.8592e+14 519365
## + bathrooms      1 2.1204e+11 5.8603e+14 519368
## + long           1 7.6113e+10 5.8616e+14 519373
## <none>                        5.8624e+14 519374
## 
## Step:  AIC=519045.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_above     1 6.5830e+12 5.7017e+14 518799
## + sqft_basement  1 6.5830e+12 5.7017e+14 518799
## + year           1 3.8749e+12 5.7288e+14 518902
## + month         11 3.7848e+12 5.7297e+14 518925
## + condition      1 2.9803e+12 5.7377e+14 518936
## + bathrooms      1 1.2558e+12 5.7549e+14 519000
## + sqft_lot       1 1.2079e+12 5.7554e+14 519002
## + sqft_living15  1 5.9721e+11 5.7615e+14 519025
## + floors         1 3.9660e+11 5.7635e+14 519033
## + yr_renovated   1 2.9137e+11 5.7646e+14 519037
## + lat            1 2.8547e+11 5.7646e+14 519037
## + sqft_lot15     1 2.5491e+11 5.7650e+14 519038
## + long           1 1.0990e+11 5.7664e+14 519043
## <none>                        5.7675e+14 519046
## 
## Step:  AIC=518799.4
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above
## 
##                 Df  Sum of Sq        RSS    AIC
## + condition      1 4.2995e+12 5.6587e+14 518638
## + floors         1 4.2484e+12 5.6592e+14 518640
## + year           1 3.8461e+12 5.6632e+14 518655
## + month         11 3.8309e+12 5.6634e+14 518676
## + bathrooms      1 1.5181e+12 5.6865e+14 518744
## + sqft_lot       1 1.1020e+12 5.6907e+14 518760
## + lat            1 3.3323e+11 5.6983e+14 518789
## + sqft_living15  1 2.6109e+11 5.6991e+14 518791
## + yr_renovated   1 2.3990e+11 5.6993e+14 518792
## + sqft_lot15     1 2.2787e+11 5.6994e+14 518793
## + long           1 1.5276e+11 5.7001e+14 518796
## <none>                        5.7017e+14 518799
## 
## Step:  AIC=518637.8
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition
## 
##                 Df  Sum of Sq        RSS    AIC
## + year           1 4.2813e+12 5.6159e+14 518476
## + floors         1 4.0451e+12 5.6182e+14 518485
## + month         11 4.1925e+12 5.6168e+14 518499
## + bathrooms      1 1.2895e+12 5.6458e+14 518590
## + sqft_lot       1 1.1403e+12 5.6473e+14 518596
## + yr_renovated   1 7.4627e+11 5.6512e+14 518611
## + lat            1 4.1043e+11 5.6546e+14 518624
## + sqft_living15  1 3.7407e+11 5.6549e+14 518626
## + sqft_lot15     1 2.2618e+11 5.6564e+14 518631
## + long           1 1.5578e+11 5.6571e+14 518634
## <none>                        5.6587e+14 518638
## 
## Step:  AIC=518475.7
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year
## 
##                 Df  Sum of Sq        RSS    AIC
## + floors         1 3.9090e+12 5.5768e+14 518327
## + bathrooms      1 1.3385e+12 5.6025e+14 518426
## + month         11 1.6235e+12 5.5996e+14 518435
## + sqft_lot       1 1.0929e+12 5.6049e+14 518436
## + yr_renovated   1 8.6150e+11 5.6072e+14 518444
## + sqft_living15  1 3.8254e+11 5.6120e+14 518463
## + lat            1 3.8029e+11 5.6121e+14 518463
## + sqft_lot15     1 2.1435e+11 5.6137e+14 518469
## + long           1 1.6422e+11 5.6142e+14 518471
## <none>                        5.6159e+14 518476
## 
## Step:  AIC=518326.7
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year + floors
## 
##                 Df  Sum of Sq        RSS    AIC
## + bathrooms      1 2.6150e+12 5.5506e+14 518227
## + yr_renovated   1 1.3500e+12 5.5633e+14 518276
## + month         11 1.6328e+12 5.5604e+14 518285
## + sqft_lot       1 8.9860e+11 5.5678e+14 518294
## + lat            1 3.1513e+11 5.5736e+14 518316
## + sqft_living15  1 2.3140e+11 5.5745e+14 518320
## + long           1 2.0375e+11 5.5747e+14 518321
## + sqft_lot15     1 1.1877e+11 5.5756e+14 518324
## <none>                        5.5768e+14 518327
## 
## Step:  AIC=518227.1
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year + floors + bathrooms
## 
##                 Df  Sum of Sq        RSS    AIC
## + month         11 1.6541e+12 5.5341e+14 518185
## + sqft_lot       1 9.1786e+11 5.5414e+14 518193
## + yr_renovated   1 8.9343e+11 5.5417e+14 518194
## + lat            1 3.1540e+11 5.5475e+14 518217
## + sqft_living15  1 2.6851e+11 5.5479e+14 518219
## + long           1 1.9322e+11 5.5487e+14 518222
## + sqft_lot15     1 1.4455e+11 5.5492e+14 518223
## <none>                        5.5506e+14 518227
## 
## Step:  AIC=518184.6
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year + floors + bathrooms + 
##     month
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_lot       1 9.1841e+11 5.5249e+14 518151
## + yr_renovated   1 9.0279e+11 5.5251e+14 518151
## + lat            1 3.1894e+11 5.5309e+14 518174
## + sqft_living15  1 2.4979e+11 5.5316e+14 518177
## + long           1 1.9370e+11 5.5321e+14 518179
## + sqft_lot15     1 1.2745e+11 5.5328e+14 518182
## <none>                        5.5341e+14 518185
## 
## Step:  AIC=518150.7
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year + floors + bathrooms + 
##     month + sqft_lot
## 
##                 Df  Sum of Sq        RSS    AIC
## + yr_renovated   1 9.2234e+11 5.5157e+14 518117
## + lat            1 3.3892e+11 5.5215e+14 518139
## + sqft_living15  1 2.6370e+11 5.5223e+14 518142
## + long           1 2.6016e+11 5.5223e+14 518143
## + sqft_lot15     1 1.2331e+11 5.5237e+14 518148
## <none>                        5.5249e+14 518151
## 
## Step:  AIC=518116.6
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year + floors + bathrooms + 
##     month + sqft_lot + yr_renovated
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_living15  1 3.3413e+11 5.5123e+14 518106
## + lat            1 3.2969e+11 5.5124e+14 518106
## + long           1 2.7642e+11 5.5129e+14 518108
## + sqft_lot15     1 1.2587e+11 5.5144e+14 518114
## <none>                        5.5157e+14 518117
## 
## Step:  AIC=518105.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year + floors + bathrooms + 
##     month + sqft_lot + yr_renovated + sqft_living15
## 
##              Df  Sum of Sq        RSS    AIC
## + lat         1 3.2184e+11 5.5091e+14 518095
## + long        1 2.7257e+11 5.5096e+14 518097
## + sqft_lot15  1 1.5383e+11 5.5108e+14 518101
## <none>                     5.5123e+14 518106
## 
## Step:  AIC=518094.9
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year + floors + bathrooms + 
##     month + sqft_lot + yr_renovated + sqft_living15 + lat
## 
##              Df  Sum of Sq        RSS    AIC
## + long        1 2.3388e+11 5.5068e+14 518088
## + sqft_lot15  1 1.5297e+11 5.5076e+14 518091
## <none>                     5.5091e+14 518095
## 
## Step:  AIC=518087.7
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year + floors + bathrooms + 
##     month + sqft_lot + yr_renovated + sqft_living15 + lat + long
## 
##              Df  Sum of Sq        RSS    AIC
## + sqft_lot15  1 1.3387e+11 5.5054e+14 518084
## <none>                     5.5068e+14 518088
## 
## Step:  AIC=518084.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built + 
##     bedrooms + sqft_above + condition + year + floors + bathrooms + 
##     month + sqft_lot + yr_renovated + sqft_living15 + lat + long + 
##     sqft_lot15
## 
##        Df Sum of Sq        RSS    AIC
## <none>              5.5054e+14 518084
## 
## Call:
## lm(formula = price ~ sqft_living + zipcode + waterfront + grade + 
##     view + yr_built + bedrooms + sqft_above + condition + year + 
##     floors + bathrooms + month + sqft_lot + yr_renovated + sqft_living15 + 
##     lat + long + sqft_lot15, data = df)
## 
## Coefficients:
##   (Intercept)    sqft_living   zipcode98002   zipcode98003   zipcode98004  
##    -2.517e+07      1.305e+02      3.135e+04     -2.400e+04      7.159e+05  
##  zipcode98005   zipcode98006   zipcode98007   zipcode98008   zipcode98010  
##     2.477e+05      2.278e+05      1.950e+05      2.055e+05      9.688e+04  
##  zipcode98011   zipcode98014   zipcode98019   zipcode98022   zipcode98023  
##     3.540e+04      7.830e+04      3.875e+04      4.171e+04     -4.618e+04  
##  zipcode98024   zipcode98027   zipcode98028   zipcode98029   zipcode98030  
##     1.483e+05      1.512e+05      2.876e+04      1.921e+05      1.095e+03  
##  zipcode98031   zipcode98032   zipcode98033   zipcode98034   zipcode98038  
##     3.575e+03     -7.741e+03      2.933e+05      1.230e+05      4.720e+04  
##  zipcode98039   zipcode98040   zipcode98042   zipcode98045   zipcode98052  
##     1.251e+06      4.576e+05      1.001e+04      1.249e+05      1.673e+05  
##  zipcode98053   zipcode98055   zipcode98056   zipcode98058   zipcode98059  
##     1.425e+05      2.101e+04      6.305e+04      1.335e+04      5.996e+04  
##  zipcode98065   zipcode98070   zipcode98072   zipcode98074   zipcode98075  
##     8.599e+04     -6.859e+04      7.303e+04      1.328e+05      1.350e+05  
##  zipcode98077   zipcode98092   zipcode98102   zipcode98103   zipcode98105  
##     5.075e+04     -2.432e+04      4.413e+05      2.532e+05      3.914e+05  
##  zipcode98106   zipcode98107   zipcode98108   zipcode98109   zipcode98112  
##     8.882e+04      2.586e+05      7.375e+04      4.182e+05      5.478e+05  
##  zipcode98115   zipcode98116   zipcode98117   zipcode98118   zipcode98119  
##     2.491e+05      2.188e+05      2.264e+05      1.212e+05      3.960e+05  
##  zipcode98122   zipcode98125   zipcode98126   zipcode98133   zipcode98136  
##     2.724e+05      1.128e+05      1.328e+05      6.827e+04      1.827e+05  
##  zipcode98144   zipcode98146   zipcode98148   zipcode98155   zipcode98166  
##     2.178e+05      5.658e+04      3.643e+04      4.950e+04      1.402e+04  
##  zipcode98168   zipcode98177   zipcode98178   zipcode98188   zipcode98198  
##     3.790e+04      1.140e+05      5.822e+03      6.150e+03     -2.439e+04  
##  zipcode98199     waterfront          grade           view       yr_built  
##     2.988e+05      6.547e+05      5.746e+04      5.477e+04     -7.051e+02  
##     bedrooms1      bedrooms2      bedrooms3      bedrooms4      bedrooms5  
##    -3.878e+04     -8.181e+04     -1.132e+05     -1.499e+05     -1.608e+05  
##     bedrooms6      bedrooms7      bedrooms8      bedrooms9     bedrooms10  
##    -1.756e+05     -2.787e+05     -1.483e+05     -3.662e+05     -4.744e+05  
##    bedrooms11     bedrooms33     sqft_above      condition       year2015  
##    -2.723e+05     -6.439e+04      7.994e+01      2.702e+04      6.119e+04  
##        floors      bathrooms        month02        month03        month04  
##    -4.471e+04      2.424e+04      5.761e+03      2.664e+04      3.245e+04  
##       month05        month06        month07        month08        month09  
##     4.791e+04      5.923e+04      5.325e+04      5.679e+04      5.081e+04  
##       month10        month11        month12       sqft_lot   yr_renovated  
##     5.351e+04      5.425e+04      5.545e+04      2.422e-01      1.839e+01  
## sqft_living15            lat           long     sqft_lot15  
##     1.067e+01      2.102e+05     -1.313e+05     -1.378e-01
library(MASS)
# Standardise every numeric column of `df`; this includes the response
# `price`, so the coefficients below are on the standardised scale.
# vapply() guarantees exactly one logical per column, whereas sapply() can
# silently change its return type.
ind <- vapply(df, is.numeric, logical(1))
df[ind] <- lapply(df[ind], scale)

# Linear model on a hand-picked set of predictors plus the
# bedrooms-by-bathrooms interaction (interaction term only, no main effects).
lm4 <- lm(
  price ~ (grade + zipcode + sqft_living + waterfront + view + condition +
    year + yr_renovated + sqft_above + bedrooms:bathrooms),
  data = df
)
summary(lm4)
## 
## Call:
## lm(formula = price ~ (grade + zipcode + sqft_living + waterfront + 
##     view + condition + year + yr_renovated + sqft_above + bedrooms:bathrooms), 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5289 -0.1884 -0.0004  0.1690 11.0437 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.583250   0.023151 -25.194  < 2e-16 ***
## grade                 0.194013   0.005406  35.890  < 2e-16 ***
## zipcode98002          0.044481   0.038674   1.150 0.250098    
## zipcode98003         -0.026238   0.034812  -0.754 0.451026    
## zipcode98004          2.134094   0.034037  62.699  < 2e-16 ***
## zipcode98005          0.868687   0.041076  21.148  < 2e-16 ***
## zipcode98006          0.730462   0.030683  23.807  < 2e-16 ***
## zipcode98007          0.668774   0.043505  15.372  < 2e-16 ***
## zipcode98008          0.701716   0.034801  20.163  < 2e-16 ***
## zipcode98010          0.197724   0.049435   4.000 6.36e-05 ***
## zipcode98011          0.363206   0.038880   9.342  < 2e-16 ***
## zipcode98014          0.339819   0.045545   7.461 8.91e-14 ***
## zipcode98019          0.249683   0.039215   6.367 1.97e-10 ***
## zipcode98022         -0.001084   0.036823  -0.029 0.976509    
## zipcode98023         -0.079214   0.030206  -2.623 0.008735 ** 
## zipcode98024          0.499822   0.053794   9.291  < 2e-16 ***
## zipcode98027          0.465261   0.031644  14.703  < 2e-16 ***
## zipcode98028          0.346521   0.034712   9.983  < 2e-16 ***
## zipcode98029          0.569500   0.033739  16.880  < 2e-16 ***
## zipcode98030          0.013597   0.035706   0.381 0.703346    
## zipcode98031          0.040355   0.035027   1.152 0.249288    
## zipcode98032          0.005807   0.045397   0.128 0.898223    
## zipcode98033          0.988031   0.031260  31.607  < 2e-16 ***
## zipcode98034          0.567286   0.029668  19.121  < 2e-16 ***
## zipcode98038          0.089311   0.029262   3.052 0.002275 ** 
## zipcode98039          3.606290   0.066500  54.230  < 2e-16 ***
## zipcode98040          1.372971   0.035335  38.856  < 2e-16 ***
## zipcode98042          0.005763   0.029634   0.194 0.845794    
## zipcode98045          0.283421   0.037341   7.590 3.33e-14 ***
## zipcode98052          0.630367   0.029478  21.384  < 2e-16 ***
## zipcode98053          0.588771   0.031845  18.489  < 2e-16 ***
## zipcode98055          0.138771   0.035241   3.938 8.25e-05 ***
## zipcode98056          0.243115   0.031656   7.680 1.66e-14 ***
## zipcode98058          0.081937   0.030814   2.659 0.007840 ** 
## zipcode98059          0.217831   0.030694   7.097 1.32e-12 ***
## zipcode98065          0.230533   0.034011   6.778 1.25e-11 ***
## zipcode98070          0.003081   0.046971   0.066 0.947709    
## zipcode98072          0.449663   0.035135  12.798  < 2e-16 ***
## zipcode98074          0.490007   0.031300  15.655  < 2e-16 ***
## zipcode98075          0.455420   0.032995  13.803  < 2e-16 ***
## zipcode98077          0.379010   0.038965   9.727  < 2e-16 ***
## zipcode98092         -0.096031   0.032791  -2.929 0.003408 ** 
## zipcode98102          1.413984   0.048699  29.035  < 2e-16 ***
## zipcode98103          0.907223   0.029250  31.017  < 2e-16 ***
## zipcode98105          1.306566   0.037107  35.211  < 2e-16 ***
## zipcode98106          0.379891   0.033298  11.409  < 2e-16 ***
## zipcode98107          0.922281   0.035489  25.988  < 2e-16 ***
## zipcode98108          0.350428   0.039536   8.863  < 2e-16 ***
## zipcode98109          1.391383   0.047933  29.028  < 2e-16 ***
## zipcode98112          1.715699   0.035528  48.291  < 2e-16 ***
## zipcode98115          0.906588   0.029426  30.809  < 2e-16 ***
## zipcode98116          0.803669   0.033508  23.984  < 2e-16 ***
## zipcode98117          0.877180   0.029738  29.497  < 2e-16 ***
## zipcode98118          0.463000   0.030211  15.325  < 2e-16 ***
## zipcode98119          1.304632   0.039851  32.738  < 2e-16 ***
## zipcode98122          0.939959   0.034654  27.124  < 2e-16 ***
## zipcode98125          0.550941   0.031603  17.433  < 2e-16 ***
## zipcode98126          0.530454   0.032900  16.123  < 2e-16 ***
## zipcode98133          0.438414   0.030334  14.453  < 2e-16 ***
## zipcode98136          0.676487   0.035633  18.985  < 2e-16 ***
## zipcode98144          0.755502   0.033110  22.818  < 2e-16 ***
## zipcode98146          0.302789   0.034637   8.742  < 2e-16 ***
## zipcode98148          0.231965   0.062308   3.723 0.000197 ***
## zipcode98155          0.406197   0.030993  13.106  < 2e-16 ***
## zipcode98166          0.191745   0.035942   5.335 9.66e-08 ***
## zipcode98168          0.217740   0.035375   6.155 7.63e-10 ***
## zipcode98177          0.632605   0.035980  17.582  < 2e-16 ***
## zipcode98178          0.123924   0.035616   3.479 0.000503 ***
## zipcode98188          0.095027   0.043989   2.160 0.030764 *  
## zipcode98198          0.004311   0.034887   0.124 0.901649    
## zipcode98199          1.080174   0.033948  31.818  < 2e-16 ***
## sqft_living           0.343773   0.007743  44.400  < 2e-16 ***
## waterfront            0.155790   0.003317  46.962  < 2e-16 ***
## view                  0.123274   0.003582  34.410  < 2e-16 ***
## condition             0.066713   0.003204  20.822  < 2e-16 ***
## year2015              0.082389   0.006390  12.894  < 2e-16 ***
## yr_renovated          0.032288   0.003048  10.592  < 2e-16 ***
## sqft_above            0.109962   0.007161  15.355  < 2e-16 ***
## bedrooms0:bathrooms  -0.096997   0.056883  -1.705 0.088172 .  
## bedrooms1:bathrooms  -0.190070   0.022267  -8.536  < 2e-16 ***
## bedrooms2:bathrooms  -0.130688   0.008587 -15.219  < 2e-16 ***
## bedrooms3:bathrooms  -0.073400   0.006145 -11.944  < 2e-16 ***
## bedrooms4:bathrooms   0.047023   0.007307   6.435 1.26e-10 ***
## bedrooms5:bathrooms   0.127393   0.009347  13.629  < 2e-16 ***
## bedrooms6:bathrooms   0.192241   0.015044  12.778  < 2e-16 ***
## bedrooms7:bathrooms  -0.120702   0.027079  -4.457 8.34e-06 ***
## bedrooms8:bathrooms   0.152619   0.053974   2.828 0.004693 ** 
## bedrooms9:bathrooms  -0.138864   0.050116  -2.771 0.005596 ** 
## bedrooms10:bathrooms -0.347320   0.103494  -3.356 0.000792 ***
## bedrooms11:bathrooms -0.199408   0.381040  -0.523 0.600753    
## bedrooms33:bathrooms -0.471070   0.923604  -0.510 0.610032    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.437 on 21522 degrees of freedom
## Multiple R-squared:  0.8098, Adjusted R-squared:  0.809 
## F-statistic:  1018 on 90 and 21522 DF,  p-value: < 2.2e-16
library(MASS)
# Standardise the numeric columns of `df` before fitting the tree.
# NOTE(review): the same standardisation already runs before the lm4 fit
# earlier in this notebook; when both chunks run in order this second pass
# should leave the values essentially unchanged -- confirm it is still needed.
# vapply() guarantees exactly one logical per column, unlike sapply().
ind <- vapply(df, is.numeric, logical(1))
df[ind] <- lapply(df[ind], scale)
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.3
library(rpart.plot)
# Regression tree (method = "anova") of price on every other column of `df`.
# The formula used to be `. + sqft_living + zipcode + ...`; `.` already
# expands to all non-response columns, so the extra terms were duplicates and
# the fitted tree is the same without them.
regTree <- rpart(price ~ ., data = df, method = "anova")

Regression tree

# Draw the fitted tree with uniform vertical spacing, then label the nodes;
# use.n = TRUE adds the number of observations to each label.
plot(regTree, main = " Regression tree", uniform = TRUE)
text(regTree, cex = 0.6, use.n = TRUE)
## Warning in labels.rpart(x, minlength = minlength): more than 52 levels in a
## predicting factor, truncated for printout

# One row, two columns of panels for the plots drawn by rsq.rpart(), which
# also prints the complexity-parameter table of the tree.
par(mfrow = c(1, 2))
rsq.rpart(regTree)
## 
## Regression tree:
## rpart(formula = price ~ . + sqft_living + zipcode + waterfront + 
##     view + yr_built + yr_renovated + bedrooms + bathrooms + sqft_above + 
##     condition + floors, data = df, method = "anova")
## 
## Variables actually used in tree construction:
## [1] grade       sqft_living zipcode    
## 
## Root node error: 21612/21613 = 0.99995
## 
## n= 21613 
## 
##          CP nsplit rel error  xerror     xstd
## 1  0.320270      0   1.00000 1.00016 0.041152
## 2  0.114782      1   0.67973 0.68012 0.032949
## 3  0.078560      2   0.56495 0.56908 0.024670
## 4  0.052578      3   0.48639 0.49051 0.024574
## 5  0.049445      4   0.43381 0.47198 0.024392
## 6  0.031927      5   0.38436 0.40071 0.019704
## 7  0.020928      6   0.35244 0.39072 0.017950
## 8  0.017918      7   0.33151 0.37025 0.017848
## 9  0.012805      8   0.31359 0.34132 0.013040
## 10 0.011242      9   0.30079 0.32594 0.012935
## 11 0.010000     10   0.28955 0.31565 0.012888

Compare the output between these two methods. Is there one that you would choose over the other?

Answer: From this, we see that the std error for the regression tree is 0.31, which is less than that of the linear regression model, which is 0.40. Thus, the regression tree model has reduced the error and is preferred over the other.


                                       # PART 2
                                        
                                      
# Read the Auto data set from a CSV file in the working directory.
Auto <- read.csv(file = "Auto.csv")

Creating a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function.

library(MASS)
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.4.2
## 
## Attaching package: 'ISLR'
## The following object is masked _by_ '.GlobalEnv':
## 
##     Auto
# mpg01 is 1 when mpg is strictly above the median mpg and 0 otherwise.
Auto$mpg01 <- as.numeric(Auto$mpg > median(Auto$mpg))

Exploring the data graphically in order to investigate the association between mpg01 and the other features.

# Pairwise correlations between all columns of Auto except the 9th.
cor(x = Auto[, -9])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000
# Scatterplot matrix of all columns of Auto except the 9th
pairs(x = Auto[, -9])

In the scatterplot matrix, horsepower and weight appear to be strongly associated with year.

# Boxplots of each candidate predictor, split by mpg01, on a 2 x 3 grid
par(mfrow = c(2, 3))
panel_titles <- c(
  cylinders = "Cylinders",
  displacement = "Displacement",
  horsepower = "Horsepower",
  weight = "Weight",
  acceleration = "Acceleration",
  year = "Year"
)
for (predictor in names(panel_titles)) {
  # Builds the formula `<predictor> ~ mpg01` for the current column
  boxplot(
    reformulate("mpg01", response = predictor),
    data = Auto,
    main = paste(panel_titles[[predictor]], "vs mpg01")
  )
}

There is some association between “mpg01” and “cylinders”, “weight”, “displacement”, “horsepower” and “acceleration”; these look the most promising.

# Random 80% / 20% split of the rows into training and test sets
set.seed(1)
rows <- sample(x = nrow(Auto), size = floor(0.80 * nrow(Auto)))
trainset <- Auto[rows, ]
testset <- Auto[-rows, ]

# LDA: fit on the training rows, then classify the held-out rows
library(MASS)
lda.fit <- lda(
  mpg01 ~ (displacement + acceleration + horsepower:year + weight + cylinders),
  data = trainset
)
lda.pred <- predict(lda.fit, newdata = testset)
# Confusion matrix: rows are the true mpg01, columns the predicted class
table(testset$mpg01, lda.pred$class)
##    
##      0  1
##   0 32  3
##   1  0 44
# LDA test misclassification rate as a percentage, rounded to 2 decimals
round(100 * mean(lda.pred$class != testset$mpg01), 2)
## [1] 3.8

The test error is 3.8%.

# Logistic regression of mpg01 (as a factor) on the training rows
lr.fit <- glm(
  as.factor(mpg01) ~
    (displacement + acceleration + horsepower:year + weight + cylinders),
  family = "binomial",
  data = trainset
)
# Predicted probabilities for the test rows, classified as "1" above 0.5
lr.probs <- predict(lr.fit, newdata = testset, type = "response")
lr.pred <- ifelse(lr.probs > 0.5, "1", "0")
# Confusion matrix: rows are the true mpg01, columns the predicted label
table(testset$mpg01, lr.pred)
##    lr.pred
##      0  1
##   0 32  3
##   1  2 42
# Test error of the logistic model as a percentage, rounded to 2 decimals
round(100 * mean(lr.pred != testset$mpg01), 2)
## [1] 6.33

The test error is 6.33%.

Performing KNN on the training data

# Standardise the columns of Auto, leaving out columns 9 and 10.
# NOTE(review): presumably these are the car name and the mpg01 label --
# confirm against the CSV layout.
data <- scale(Auto[, -c(9, 10)])
set.seed(1234)
# 70% / 30% train/test split. The row count comes from the data instead of
# being hard-coded as 392, and `replace` is spelled out rather than relying
# on partial matching of `rep`.
train <- sample(
  seq_len(nrow(Auto)),
  size = floor(nrow(Auto) * 0.7),
  replace = FALSE
)
test <- -train
knn_predictors <- c(
  "cylinders", "weight", "displacement", "horsepower", "year", "acceleration"
)
training_data <- data[train, knn_predictors]
testing_data <- data[test, knn_predictors]
## knn() takes the training response separately from the predictors
train.mpg01 <- Auto$mpg01[train]

## the test response is also kept separately, to assess the model later on
test.mpg01 <- Auto$mpg01[test]
library(class)
set.seed(1234)
# Baseline fit with a single nearest neighbour
knn_pred_y <- knn(training_data, testing_data, train.mpg01, k = 1)
# Confusion matrix: rows are the predictions, columns the true mpg01
table(knn_pred_y, test.mpg01)
##           test.mpg01
## knn_pred_y  0  1
##          0 51  4
##          1  5 58
# Test misclassification rate (as a fraction) of the k = 1 KNN predictions
mean(knn_pred_y != test.mpg01)
## [1] 0.07627119
# Using a for loop to find the optimum K value
knn_pred_y <- NULL
n_test <- nrow(testing_data)
# One error rate per candidate k, for k = 1 .. number of test rows.
error_rate <- numeric(n_test)
for (i in seq_len(n_test)) {
  # Re-seed on every pass so each fit starts from the same RNG state.
  set.seed(1234)
  knn_pred_y <- knn(training_data, testing_data, train.mpg01, k = i)
  error_rate[i] <- mean(test.mpg01 != knn_pred_y)
}
### find the minimum error rate (as a percentage, rounded to two decimals)
min_error_rate <- min(round(error_rate * 100, 2))
print(min_error_rate)
## [1] 5.93

The minimum error rate is 5.93%

### get the index of the minimum error rate, which is the k
# error_rate holds proportions while min_error_rate is a rounded percentage,
# so comparing the two never matches and yields integer(0). Compare against
# the unrounded minimum instead; which() returns every k that attains it.
K <- which(error_rate == min(error_rate))
print(K)
## integer(0)
# When we train a KNN model with k=3, then we get the lowest misclassification error rate of 5.93%.
library(ggplot2)
# qplot() is deprecated in current ggplot2 releases; build the same
# point-and-line plot of error rate against K with ggplot() instead.
k_errors <- data.frame(k = seq_along(error_rate), error_rate = error_rate)
ggplot(k_errors, aes(x = k, y = error_rate)) +
  geom_point() +
  geom_line() +
  labs(x = "K", y = "Error Rate")

Which value of K seems to perform the best on this data set? Answer:3 ——————————————————————————————————————————————————————————————————

                                        PART 3
                                        
                                        DBSCAN
# The clustering files appear to have no header row: with the default
# header = TRUE the first observation was consumed as column names (e.g.
# "X5.867498067335642276e.00"), leaving 1499 points. Read them with
# header = FALSE so every row is kept as data.
# NOTE(review): the mangled column names are only visible for clustering1.csv;
# the other three files are assumed to share the layout because they also
# yield 1499 points -- verify.
data <- read.csv("clustering1.csv", header = FALSE)
df2 <- read.csv("clustering2.csv", header = FALSE)
df3 <- read.csv("clustering3.csv", header = FALSE)
df4 <- read.csv("clustering4.csv", header = FALSE)
library(dbscan)
library("fpc")
## Warning: package 'fpc' was built under R version 3.4.3
## 
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
## 
##     dbscan
set.seed(1234)
# Density-based clustering of each dataset, each with its own eps / MinPts.
# fpc:: is spelled out because fpc masks dbscan::dbscan.
db1 <- fpc::dbscan(data, eps = 2, MinPts = 80)
db2 <- fpc::dbscan(df2, eps = 0.5, MinPts = 10)
db3 <- fpc::dbscan(df3, eps = 0.15, MinPts = 5)
db4 <- fpc::dbscan(df4, eps = 0.18, MinPts = 10)
# Print each DBSCAN fit; the tables list border, seed and total point counts
# per cluster for the parameters shown in the header line.
print(db1)
## dbscan Pts=1499 MinPts=80 eps=2
##          1   2   3
## border   8  11   7
## seed   491 489 493
## total  499 500 500
print(db2)
## dbscan Pts=1499 MinPts=10 eps=0.5
##          1
## seed  1499
## total 1499
print(db3)
## dbscan Pts=1499 MinPts=5 eps=0.15
##          1   2
## border   1   0
## seed   748 750
## total  749 750
print(db4)
## dbscan Pts=1499 MinPts=10 eps=0.18
##         1   2
## seed  750 749
## total 750 749
library("factoextra")
## Warning: package 'factoextra' was built under R version 3.4.1
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# All four DBSCAN plots share the same options: unstandardised coordinates,
# points only, no ellipses and no cluster centres.
plot_dbscan <- function(fit, points) {
  fviz_cluster(
    fit,
    data = points,
    stand = FALSE,
    ellipse = FALSE,
    show.clust.cent = FALSE,
    geom = "point",
    palette = "jco",
    ggtheme = theme_classic()
  )
}
dp1 <- plot_dbscan(db1, data)
dp2 <- plot_dbscan(db2, df2)
dp3 <- plot_dbscan(db3, df3)
dp4 <- plot_dbscan(db4, df4)

library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.4.1
# Arrange the four plots in a 2 x 2 grid.
grid.arrange(dp1, dp2, dp3, dp4, nrow = 2)

Hierarchical clustering
# Drop incomplete rows and standardise the columns before computing distances.
data <- na.omit(data)
data <- scale(data)
head(data)
##   X5.867498067335642276e.00 X8.177151880030342213e.00
## 1                 0.3971268                1.24070227
## 2                 0.6611644                1.30575784
## 3                 0.5854193                0.06447004
## 4                 0.7907919                0.18275159
## 5                 0.8539870               -0.05893430
## 6                 0.5443075                1.22563007
# Complete-linkage hierarchical clustering on Euclidean distances.
d <- dist(data, method = "euclidean")
hc1 <- hclust(d, method = "complete")
plot(hc1, cex = 0.6, hang = -1)
# Cut the dendrogram into three clusters and tabulate their sizes.
segtree <- cutree(hc1, k = 3)
table(segtree)
## segtree
##   1   2   3
## 499 500 500
plot(hc1, cex = 0.6)
# border takes one colour per cluster; 2.5 was presumably a typo for a colour
# range, so use colours 2 to 4 for the three rectangles.
rect.hclust(hc1, k = 3, border = 2:4)
fviz_cluster(list(data = data, cluster = segtree))
# Complete-linkage hierarchical clustering of df2 on Euclidean distances.
d1 <- dist(df2, method = "euclidean")
hc2 <- hclust(d1, method = "complete")
plot(hc2, cex = 0.6, hang = -1)
# Cut the dendrogram into three clusters and tabulate their sizes.
segtree1 <- cutree(hc2, k = 3)
table(segtree1)
## segtree1
##   1   2   3
## 459 617 423
plot(hc2, cex = 0.6)
# border takes one colour per cluster; 2.5 was presumably a typo for a colour
# range, so use colours 2 to 4 for the three rectangles.
rect.hclust(hc2, k = 3, border = 2:4)
fviz_cluster(list(data = df2, cluster = segtree1))
# Complete-linkage hierarchical clustering of df3 on Euclidean distances.
d3 <- dist(df3, method = "euclidean")
hc3 <- hclust(d3, method = "complete")
plot(hc3, cex = 0.6, hang = -1)
# Cut the dendrogram into three clusters and tabulate their sizes.
segtree3 <- cutree(hc3, k = 3)
table(segtree3)
## segtree3
##   1   2   3
## 574 569 356
plot(hc3, cex = 0.6)
# border takes one colour per cluster; 2.5 was presumably a typo for a colour
# range, so use colours 2 to 4 for the three rectangles.
rect.hclust(hc3, k = 3, border = 2:4)
fviz_cluster(list(data = df3, cluster = segtree3))
# Complete-linkage hierarchical clustering of df4 on Euclidean distances.
d4 <- dist(df4, method = "euclidean")
hc4 <- hclust(d4, method = "complete")
plot(hc4, cex = 0.6, hang = -1)
# Cut the dendrogram into three clusters and tabulate their sizes.
segtree4 <- cutree(hc4, k = 3)
table(segtree4)
## segtree4
##   1   2   3
## 637 576 286
plot(hc4, cex = 0.6)
# border takes one colour per cluster; 2.5 was presumably a typo for a colour
# range, so use colours 2 to 4 for the three rectangles.
rect.hclust(hc4, k = 3, border = 2:4)
fviz_cluster(list(data = df4, cluster = segtree4))

K-MEANS

# Each kmeans() call tries nstart = 25 random starts, so fix the seed (as the
# other stochastic steps in this notebook do) to make the result reproducible.
set.seed(1234)
k1 <- kmeans(data, centers = 4, nstart = 25)
k2 <- kmeans(df2, centers = 8, nstart = 25)
k3 <- kmeans(df3, centers = 15, nstart = 25)
k4 <- kmeans(df4, centers = 20, nstart = 25)
# One cluster plot per dataset, titled with the dataset it shows.
pt1 <- fviz_cluster(k1, geom = "point", data = data) +
  ggtitle("Kmeans - Clustering 1 dataset")
pt2 <- fviz_cluster(k2, geom = "point", data = df2) +
  ggtitle("Kmeans - Clustering 2 dataset")
pt3 <- fviz_cluster(k3, geom = "point", data = df3) +
  ggtitle("Kmeans - Clustering 3 dataset")
pt4 <- fviz_cluster(k4, geom = "point", data = df4) +
  ggtitle("Kmeans - Clustering 4 dataset")
library(gridExtra)
# Arrange the four plots in a 2 x 2 grid.
grid.arrange(pt1, pt2, pt3, pt4, nrow = 2)


                                    PART 4

1. Suppose we have a dataset with five predictors, $X_1$ = GPA, $X_2$ = IQ, $X_3$ = Gender (1 for Female, 0 for Male), $X_4$ = Interaction between GPA and IQ, and $X_5$ = Interaction between GPA and Gender. The response is the starting salary after graduation (in thousands of dollars). Suppose we use least squares to fit the model, and get $\hat{\beta}_0 = 50$, $\hat{\beta}_1 = 20$, $\hat{\beta}_2 = 0.07$, $\hat{\beta}_3 = 35$, $\hat{\beta}_4 = 0.01$, and $\hat{\beta}_5 = -10$. A) Which answer is correct, and why? (i) For a fixed value of IQ and GPA, males earn more on average than females. (ii) For a fixed value of IQ and GPA, females earn more on average than males. (iii) For a fixed value of IQ and GPA, males earn more on average than females provided that the GPA is high enough. (iv) For a fixed value of IQ and GPA, females earn more on average than males provided that the GPA is high enough.

Answer: The correct answer is (iii), i.e. for a fixed value of IQ and GPA, males earn more on average than females provided that the GPA is high enough. With males coded 0 and females coded 1, male is the baseline. The positive sign of $\hat{\beta}_3$ shows that, on average, women earn more than men when both have zero GPA and the same IQ. However, as GPA increases, average wages become relatively higher for men ($\hat{\beta}_5 < 0$): the female-minus-male difference is $35 - 10 \times \text{GPA}$, which turns negative once GPA exceeds 3.5. Therefore, if GPA is high enough, men will earn more than women on average; if the GPA is low (below 3.5), answer (ii) is correct instead.

B)Predict the salary of a female with IQ of 110 and a GPA of 4.0.

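Answer: The fitted linear regression model is $\hat{Y} = \hat{\beta}_0 + \hat{\beta}_1 \text{GPA} + \hat{\beta}_2 \text{IQ} + \hat{\beta}_3 \text{Gender} + \hat{\beta}_4 (\text{GPA} \times \text{IQ}) + \hat{\beta}_5 (\text{GPA} \times \text{Gender})$,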

which, for a female (Gender = 1), is equal to $85 + 10 \times \text{GPA} + 0.07 \times \text{IQ} + 0.01 \times \text{GPA} \times \text{IQ}$.

$\hat{Y} = 85 + 10 \times 4 + 0.07 \times 110 + 0.01 \times 4 \times 110 = 85 + 40 + 7.7 + 4.4 = 137.1$, which is \$137,100 for the female’s starting salary!

Suppose we collect data for a group of students in a statistics class with variables X1=hours studied, X2=undergrad GPA, and Y=receive an A. We fit a logistic regression and produce estimated coefficient, β̂ 0=−6, β̂ 1=0.05, β̂ 2=1. Estimate the probability that a student who studies for 40 h and has an undergrad GPA of 3.5 gets an A in the class.

Answer: Given:

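$\hat{\beta}_0 = -6$, $\hat{\beta}_1 = 0.05$, $\hat{\beta}_2 = 1$, $X_1 = 40$ and $X_2 = 3.5$. Probability $= \dfrac{e^{\hat{\beta}_0 + \hat{\beta}_1 \times 40 + \hat{\beta}_2 \times 3.5}}{1 + e^{\hat{\beta}_0 + \hat{\beta}_1 \times 40 + \hat{\beta}_2 \times 3.5}} = \dfrac{e^{-0.5}}{1 + e^{-0.5}} \approx 0.3775$. Probability = 37.75%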

How many hours would the student in part (a) need to study to have a 50 % chance of getting an A in the class?

Answer: To increase the chance of an A without altering the GPA, the student has to increase the number of hours studied, so we can test a sequence of hours and see how the chance changes. Solving directly, a 50% chance means the log-odds are zero: $-6 + 0.05 X_1 + 1 \times 3.5 = 0$, so $X_1 = 2.5 / 0.05 = 50$. Therefore, to have a 50% chance, the student needs to study at least 50 hours. ——————————————————————————————————————————————————————————————————

# Six observations in two dimensions, entered row by row as (x1, x2) pairs.
n <- 6
x <- matrix(
  c(
    1, 4,
    1, 3,
    0, 4,
    5, 1,
    6, 2,
    4, 0
  ),
  nrow = n,
  byrow = TRUE
)
plot(x)

Randomly assign a cluster label to each observation. Report the cluster labels for each observation.

set.seed(1)
# Draw an initial cluster label (1 or 2) for every row, with replacement.
labels <- sample(2, size = nrow(x), replace = TRUE)
labels
## [1] 1 1 2 2 1 2
# Shift the labels by one so the clusters are drawn in colours 2 and 3.
plot(x, col = labels + 1, pch = 20, cex = 2)

Compute the centroid for each cluster.

# Centroid of a cluster = mean of each coordinate over the rows with that label.
centroid1 <- vapply(1:2, function(j) mean(x[labels == 1, j]), numeric(1))
centroid2 <- vapply(1:2, function(j) mean(x[labels == 2, j]), numeric(1))
plot(x[, 1], x[, 2], col = labels + 1, pch = 20, cex = 2)
# Mark each centroid with a cross in its cluster's colour.
points(centroid1[1], centroid1[2], col = 2, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)

Assign each observation to the centroid to which it is closest, in terms of Euclidean distance. Report the cluster labels for each observation.

# Assign every observation to its nearest centroid instead of hard-coding the
# answer, so the labels stay correct whatever the random start was.
# Squared Euclidean distance gives the same ordering as the distance itself.
dist_to_1 <- (x[, 1] - centroid1[1])^2 + (x[, 2] - centroid1[2])^2
dist_to_2 <- (x[, 1] - centroid2[1])^2 + (x[, 2] - centroid2[2])^2
labels <- ifelse(dist_to_1 <= dist_to_2, 1, 2)
# Report the cluster label of each observation.
labels
plot(x[, 1], x[, 2], col = (labels + 1), pch = 20, cex = 2)
# Mark each centroid with a cross in its cluster's colour.
points(centroid1[1], centroid1[2], col = 2, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)

Repeat (c) and (d) until the answers obtained stop changing.

# Alternate the centroid step and the assignment step until the labels stop
# changing, rather than recomputing the centroids only once.
repeat {
  centroid1 <- c(mean(x[labels == 1, 1]), mean(x[labels == 1, 2]))
  centroid2 <- c(mean(x[labels == 2, 1]), mean(x[labels == 2, 2]))
  # Squared Euclidean distance gives the same ordering as the distance itself.
  dist_to_1 <- (x[, 1] - centroid1[1])^2 + (x[, 2] - centroid1[2])^2
  dist_to_2 <- (x[, 1] - centroid2[1])^2 + (x[, 2] - centroid2[2])^2
  new_labels <- ifelse(dist_to_1 <= dist_to_2, 1, 2)
  if (identical(new_labels, labels)) {
    break
  }
  labels <- new_labels
}
# Report the final cluster label of each observation.
labels
plot(x[, 1], x[, 2], col = (labels + 1), pch = 20, cex = 2)
# Mark each centroid with a cross in its cluster's colour.
points(centroid1[1], centroid1[2], col = 2, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)

In your plot from (a), color the observations according to the clusters labels obtained.

# Observations coloured by their current cluster label (colours 2 and 3).
plot(
  x = x[, 1],
  y = x[, 2],
  col = labels + 1,
  pch = 20,
  cex = 2
)